From 50013ca9a57c42114044f593c981bbad8c405cc9 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Wed, 3 Aug 2022 11:43:36 +0300 Subject: [PATCH] intel: add a hasvk vulkan driver This new driver is a copy of the current Anv code, it will only load on gfx7/8 platforms though. Signed-off-by: Lionel Landwerlin Acked-by: Jason Ekstrand Acked-by: Jason Ekstrand Acked-by: Jason Ekstrand Part-of: --- meson.build | 7 +- meson_options.txt | 2 +- src/intel/meson.build | 3 + src/intel/vulkan_hasvk/TODO | 13 + .../vulkan_hasvk/anv_acceleration_structure.c | 251 + src/intel/vulkan_hasvk/anv_allocator.c | 2176 +++++ src/intel/vulkan_hasvk/anv_android.c | 792 ++ src/intel/vulkan_hasvk/anv_android.h | 57 + src/intel/vulkan_hasvk/anv_android_stubs.c | 63 + src/intel/vulkan_hasvk/anv_batch_chain.c | 2477 ++++++ src/intel/vulkan_hasvk/anv_blorp.c | 1995 +++++ src/intel/vulkan_hasvk/anv_bo_sync.c | 237 + src/intel/vulkan_hasvk/anv_cmd_buffer.c | 1112 +++ src/intel/vulkan_hasvk/anv_descriptor_set.c | 2046 +++++ src/intel/vulkan_hasvk/anv_device.c | 4834 +++++++++++ src/intel/vulkan_hasvk/anv_formats.c | 1745 ++++ src/intel/vulkan_hasvk/anv_gem.c | 405 + src/intel/vulkan_hasvk/anv_gem_stubs.c | 187 + src/intel/vulkan_hasvk/anv_genX.h | 180 + src/intel/vulkan_hasvk/anv_image.c | 2973 +++++++ src/intel/vulkan_hasvk/anv_measure.c | 516 ++ src/intel/vulkan_hasvk/anv_measure.h | 82 + src/intel/vulkan_hasvk/anv_nir.h | 97 + .../anv_nir_add_base_work_group_id.c | 63 + .../anv_nir_apply_pipeline_layout.c | 1686 ++++ .../anv_nir_compute_push_layout.c | 290 + .../vulkan_hasvk/anv_nir_lower_multiview.c | 324 + .../vulkan_hasvk/anv_nir_lower_ubo_loads.c | 124 + .../anv_nir_lower_ycbcr_textures.c | 349 + src/intel/vulkan_hasvk/anv_perf.c | 488 ++ src/intel/vulkan_hasvk/anv_pipeline.c | 3300 ++++++++ src/intel/vulkan_hasvk/anv_pipeline_cache.c | 380 + src/intel/vulkan_hasvk/anv_private.h | 4303 ++++++++++ src/intel/vulkan_hasvk/anv_queue.c | 75 + src/intel/vulkan_hasvk/anv_util.c | 92 + src/intel/vulkan_hasvk/anv_utrace.c | 346 + src/intel/vulkan_hasvk/anv_wsi.c | 118 + src/intel/vulkan_hasvk/genX_blorp_exec.c | 410 + src/intel/vulkan_hasvk/genX_cmd_buffer.c | 7488 +++++++++++++++++ src/intel/vulkan_hasvk/genX_gpu_memcpy.c | 324 + src/intel/vulkan_hasvk/genX_pipeline.c | 2563 ++++++ src/intel/vulkan_hasvk/genX_query.c | 1530 ++++ src/intel/vulkan_hasvk/genX_state.c | 1141 +++ src/intel/vulkan_hasvk/gfx7_cmd_buffer.c | 314 + src/intel/vulkan_hasvk/gfx8_cmd_buffer.c | 706 ++ src/intel/vulkan_hasvk/meson.build | 265 + .../tests/block_pool_grow_first.c | 67 + .../vulkan_hasvk/tests/block_pool_no_free.c | 153 + src/intel/vulkan_hasvk/tests/state_pool.c | 59 + .../tests/state_pool_free_list_only.c | 68 + .../vulkan_hasvk/tests/state_pool_no_free.c | 119 + .../vulkan_hasvk/tests/state_pool_padding.c | 79 + .../tests/state_pool_test_helper.h | 71 + src/intel/vulkan_hasvk/tests/test_common.h | 34 + 54 files changed, 49575 insertions(+), 4 deletions(-) create mode 100644 src/intel/vulkan_hasvk/TODO create mode 100644 src/intel/vulkan_hasvk/anv_acceleration_structure.c create mode 100644 src/intel/vulkan_hasvk/anv_allocator.c create mode 100644 src/intel/vulkan_hasvk/anv_android.c create mode 100644 src/intel/vulkan_hasvk/anv_android.h create mode 100644 src/intel/vulkan_hasvk/anv_android_stubs.c create mode 100644 src/intel/vulkan_hasvk/anv_batch_chain.c create mode 100644 src/intel/vulkan_hasvk/anv_blorp.c create mode 100644 src/intel/vulkan_hasvk/anv_bo_sync.c create mode 100644 src/intel/vulkan_hasvk/anv_cmd_buffer.c create mode 
100644 src/intel/vulkan_hasvk/anv_descriptor_set.c create mode 100644 src/intel/vulkan_hasvk/anv_device.c create mode 100644 src/intel/vulkan_hasvk/anv_formats.c create mode 100644 src/intel/vulkan_hasvk/anv_gem.c create mode 100644 src/intel/vulkan_hasvk/anv_gem_stubs.c create mode 100644 src/intel/vulkan_hasvk/anv_genX.h create mode 100644 src/intel/vulkan_hasvk/anv_image.c create mode 100644 src/intel/vulkan_hasvk/anv_measure.c create mode 100644 src/intel/vulkan_hasvk/anv_measure.h create mode 100644 src/intel/vulkan_hasvk/anv_nir.h create mode 100644 src/intel/vulkan_hasvk/anv_nir_add_base_work_group_id.c create mode 100644 src/intel/vulkan_hasvk/anv_nir_apply_pipeline_layout.c create mode 100644 src/intel/vulkan_hasvk/anv_nir_compute_push_layout.c create mode 100644 src/intel/vulkan_hasvk/anv_nir_lower_multiview.c create mode 100644 src/intel/vulkan_hasvk/anv_nir_lower_ubo_loads.c create mode 100644 src/intel/vulkan_hasvk/anv_nir_lower_ycbcr_textures.c create mode 100644 src/intel/vulkan_hasvk/anv_perf.c create mode 100644 src/intel/vulkan_hasvk/anv_pipeline.c create mode 100644 src/intel/vulkan_hasvk/anv_pipeline_cache.c create mode 100644 src/intel/vulkan_hasvk/anv_private.h create mode 100644 src/intel/vulkan_hasvk/anv_queue.c create mode 100644 src/intel/vulkan_hasvk/anv_util.c create mode 100644 src/intel/vulkan_hasvk/anv_utrace.c create mode 100644 src/intel/vulkan_hasvk/anv_wsi.c create mode 100644 src/intel/vulkan_hasvk/genX_blorp_exec.c create mode 100644 src/intel/vulkan_hasvk/genX_cmd_buffer.c create mode 100644 src/intel/vulkan_hasvk/genX_gpu_memcpy.c create mode 100644 src/intel/vulkan_hasvk/genX_pipeline.c create mode 100644 src/intel/vulkan_hasvk/genX_query.c create mode 100644 src/intel/vulkan_hasvk/genX_state.c create mode 100644 src/intel/vulkan_hasvk/gfx7_cmd_buffer.c create mode 100644 src/intel/vulkan_hasvk/gfx8_cmd_buffer.c create mode 100644 src/intel/vulkan_hasvk/meson.build create mode 100644 src/intel/vulkan_hasvk/tests/block_pool_grow_first.c create mode 100644 src/intel/vulkan_hasvk/tests/block_pool_no_free.c create mode 100644 src/intel/vulkan_hasvk/tests/state_pool.c create mode 100644 src/intel/vulkan_hasvk/tests/state_pool_free_list_only.c create mode 100644 src/intel/vulkan_hasvk/tests/state_pool_no_free.c create mode 100644 src/intel/vulkan_hasvk/tests/state_pool_padding.c create mode 100644 src/intel/vulkan_hasvk/tests/state_pool_test_helper.h create mode 100644 src/intel/vulkan_hasvk/tests/test_common.h diff --git a/meson.build b/meson.build index bec326c462f..1f310a24537 100644 --- a/meson.build +++ b/meson.build @@ -250,7 +250,7 @@ _vulkan_drivers = get_option('vulkan-drivers') if _vulkan_drivers.contains('auto') if system_has_kms_drm if host_machine.cpu_family().startswith('x86') - _vulkan_drivers = ['amd', 'intel', 'swrast'] + _vulkan_drivers = ['amd', 'intel', 'intel_hasvk', 'swrast'] elif ['arm', 'aarch64'].contains(host_machine.cpu_family()) _vulkan_drivers = ['swrast'] elif ['mips', 'mips64', 'riscv32', 'riscv64'].contains(host_machine.cpu_family()) @@ -269,6 +269,7 @@ if _vulkan_drivers.contains('auto') endif with_intel_vk = _vulkan_drivers.contains('intel') +with_intel_hasvk = _vulkan_drivers.contains('intel_hasvk') with_amd_vk = _vulkan_drivers.contains('amd') with_freedreno_vk = _vulkan_drivers.contains('freedreno') with_panfrost_vk = _vulkan_drivers.contains('panfrost') @@ -283,7 +284,7 @@ with_microsoft_vk = _vulkan_drivers.contains('microsoft-experimental') with_any_vk = _vulkan_drivers.length() != 0 with_any_broadcom = 
with_gallium_vc4 or with_gallium_v3d or with_broadcom_vk -with_any_intel = with_intel_vk or with_gallium_iris or with_gallium_crocus or with_intel_tools +with_any_intel = with_intel_vk or with_intel_hasvk or with_gallium_iris or with_gallium_crocus or with_intel_tools if with_swrast_vk and not with_gallium_softpipe error('swrast vulkan requires gallium swrast') @@ -1549,7 +1550,7 @@ endif if cc.has_function('dl_iterate_phdr') pre_args += '-DHAVE_DL_ITERATE_PHDR' -elif with_intel_vk +elif with_intel_vk or with_intel_hasvk error('Intel "Anvil" Vulkan driver requires the dl_iterate_phdr function') endif diff --git a/meson_options.txt b/meson_options.txt index 2a5d9fddf6b..283a02bc934 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -198,7 +198,7 @@ option( 'vulkan-drivers', type : 'array', value : ['auto'], - choices : ['auto', 'amd', 'broadcom', 'freedreno', 'imagination-experimental', 'intel', 'microsoft-experimental', 'panfrost', 'swrast', 'virtio-experimental'], + choices : ['auto', 'amd', 'broadcom', 'freedreno', 'imagination-experimental', 'intel', 'intel_hasvk', 'microsoft-experimental', 'panfrost', 'swrast', 'virtio-experimental'], description : 'List of vulkan drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built' ) option( diff --git a/src/intel/meson.build b/src/intel/meson.build index 5d177553d89..ad77e11e3bb 100644 --- a/src/intel/meson.build +++ b/src/intel/meson.build @@ -38,3 +38,6 @@ endif if with_intel_vk subdir('vulkan') endif +if with_intel_hasvk + subdir('vulkan_hasvk') +endif diff --git a/src/intel/vulkan_hasvk/TODO b/src/intel/vulkan_hasvk/TODO new file mode 100644 index 00000000000..4c41e251888 --- /dev/null +++ b/src/intel/vulkan_hasvk/TODO @@ -0,0 +1,13 @@ +Intel Vulkan ToDo +================= + +Missing Features: + - Investigate CTS failures on HSW + - Sparse memory + +Performance: + - Multi-{sampled/gfx8,LOD} HiZ + - MSAA fast clears + - Pushing pieces of UBOs? + - Enable guardband clipping + - Use soft-pin to avoid relocations diff --git a/src/intel/vulkan_hasvk/anv_acceleration_structure.c b/src/intel/vulkan_hasvk/anv_acceleration_structure.c new file mode 100644 index 00000000000..f003772e9c1 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_acceleration_structure.c @@ -0,0 +1,251 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "anv_private.h" + +void +anv_GetAccelerationStructureBuildSizesKHR( + VkDevice device, + VkAccelerationStructureBuildTypeKHR buildType, + const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo, + const uint32_t* pMaxPrimitiveCounts, + VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo) +{ + assert(pSizeInfo->sType == + VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR); + + pSizeInfo->accelerationStructureSize = 0; /* TODO */ + + uint64_t cpu_build_scratch_size = 0; /* TODO */ + uint64_t cpu_update_scratch_size = cpu_build_scratch_size; + + uint64_t gpu_build_scratch_size = 0; /* TODO */ + uint64_t gpu_update_scratch_size = gpu_build_scratch_size; + + switch (buildType) { + case VK_ACCELERATION_STRUCTURE_BUILD_TYPE_HOST_KHR: + pSizeInfo->buildScratchSize = cpu_build_scratch_size; + pSizeInfo->updateScratchSize = cpu_update_scratch_size; + break; + + case VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR: + pSizeInfo->buildScratchSize = gpu_build_scratch_size; + pSizeInfo->updateScratchSize = gpu_update_scratch_size; + break; + + case VK_ACCELERATION_STRUCTURE_BUILD_TYPE_HOST_OR_DEVICE_KHR: + pSizeInfo->buildScratchSize = MAX2(cpu_build_scratch_size, + gpu_build_scratch_size); + pSizeInfo->updateScratchSize = MAX2(cpu_update_scratch_size, + gpu_update_scratch_size); + break; + + default: + unreachable("Invalid acceleration structure build type"); + } +} + +VkResult +anv_CreateAccelerationStructureKHR( + VkDevice _device, + const VkAccelerationStructureCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkAccelerationStructureKHR* pAccelerationStructure) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer); + struct anv_acceleration_structure *accel; + + accel = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*accel), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (accel == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + vk_object_base_init(&device->vk, &accel->base, + VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR); + + accel->size = pCreateInfo->size; + accel->address = anv_address_add(buffer->address, pCreateInfo->offset); + + *pAccelerationStructure = anv_acceleration_structure_to_handle(accel); + + return VK_SUCCESS; +} + +void +anv_DestroyAccelerationStructureKHR( + VkDevice _device, + VkAccelerationStructureKHR accelerationStructure, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_acceleration_structure, accel, accelerationStructure); + + if (!accel) + return; + + vk_object_base_finish(&accel->base); + vk_free2(&device->vk.alloc, pAllocator, accel); +} + +VkDeviceAddress +anv_GetAccelerationStructureDeviceAddressKHR( + VkDevice device, + const VkAccelerationStructureDeviceAddressInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_acceleration_structure, accel, + pInfo->accelerationStructure); + + assert(!anv_address_is_null(accel->address)); + assert(anv_bo_is_pinned(accel->address.bo)); + + return anv_address_physical(accel->address); +} + +void +anv_GetDeviceAccelerationStructureCompatibilityKHR( + VkDevice device, + const VkAccelerationStructureVersionInfoKHR* pVersionInfo, + VkAccelerationStructureCompatibilityKHR* pCompatibility) +{ + unreachable("Unimplemented"); +} + +VkResult +anv_BuildAccelerationStructuresKHR( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const 
VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkResult +anv_CopyAccelerationStructureKHR( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + const VkCopyAccelerationStructureInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkResult +anv_CopyAccelerationStructureToMemoryKHR( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkResult +anv_CopyMemoryToAccelerationStructureKHR( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkResult +anv_WriteAccelerationStructuresPropertiesKHR( + VkDevice _device, + uint32_t accelerationStructureCount, + const VkAccelerationStructureKHR* pAccelerationStructures, + VkQueryType queryType, + size_t dataSize, + void* pData, + size_t stride) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +void +anv_CmdBuildAccelerationStructuresKHR( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) +{ + unreachable("Unimplemented"); +} + +void +anv_CmdBuildAccelerationStructuresIndirectKHR( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts) +{ + unreachable("Unimplemented"); +} + +void +anv_CmdCopyAccelerationStructureKHR( + VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureInfoKHR* pInfo) +{ + unreachable("Unimplemented"); +} + +void +anv_CmdCopyAccelerationStructureToMemoryKHR( + VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo) +{ + unreachable("Unimplemented"); +} + +void +anv_CmdCopyMemoryToAccelerationStructureKHR( + VkCommandBuffer commandBuffer, + const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo) +{ + unreachable("Unimplemented"); +} + +void +anv_CmdWriteAccelerationStructuresPropertiesKHR( + VkCommandBuffer commandBuffer, + uint32_t accelerationStructureCount, + const VkAccelerationStructureKHR* pAccelerationStructures, + VkQueryType queryType, + VkQueryPool queryPool, + uint32_t firstQuery) +{ + unreachable("Unimplemented"); +} diff --git a/src/intel/vulkan_hasvk/anv_allocator.c b/src/intel/vulkan_hasvk/anv_allocator.c new file mode 100644 index 00000000000..ce64811b178 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_allocator.c @@ -0,0 +1,2176 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, 
modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <stdlib.h> +#include <unistd.h> +#include <limits.h> +#include <assert.h> +#include <sys/mman.h> + +#include "anv_private.h" + +#include "common/intel_aux_map.h" +#include "util/anon_file.h" +#include "util/futex.h" + +#ifdef HAVE_VALGRIND +#define VG_NOACCESS_READ(__ptr) ({ \ + VALGRIND_MAKE_MEM_DEFINED((__ptr), sizeof(*(__ptr))); \ + __typeof(*(__ptr)) __val = *(__ptr); \ + VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));\ + __val; \ +}) +#define VG_NOACCESS_WRITE(__ptr, __val) ({ \ + VALGRIND_MAKE_MEM_UNDEFINED((__ptr), sizeof(*(__ptr))); \ + *(__ptr) = (__val); \ + VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr))); \ +}) +#else +#define VG_NOACCESS_READ(__ptr) (*(__ptr)) +#define VG_NOACCESS_WRITE(__ptr, __val) (*(__ptr) = (__val)) +#endif + +#ifndef MAP_POPULATE +#define MAP_POPULATE 0 +#endif + +/* Design goals: + * + * - Lock free (except when resizing underlying bos) + * + * - Constant time allocation with typically only one atomic + * + * - Multiple allocation sizes without fragmentation + * + * - Can grow while keeping addresses and offset of contents stable + * + * - All allocations within one bo so we can point one of the + * STATE_BASE_ADDRESS pointers at it. + * + * The overall design is a two-level allocator: top level is a fixed size, big + * block (8k) allocator, which operates out of a bo. Allocation is done by + * either pulling a block from the free list or growing the used range of the + * bo. Growing the range may run out of space in the bo which we then need to + * grow. Growing the bo is tricky in a multi-threaded, lockless environment: + * we need to keep all pointers and contents in the old map valid. GEM bos in + * general can't grow, but we use a trick: we create a memfd and use ftruncate + * to grow it as necessary. We mmap the new size and then create a gem bo for + * it using the new gem userptr ioctl. Without heavy-handed locking around + * our allocation fast-path, there isn't really a way to munmap the old mmap, + * so we just keep it around until garbage collection time. While the block + * allocator is lockless for normal operations, we block other threads trying + * to allocate while we're growing the map. It shouldn't happen often, and + * growing is fast anyway. + * + * At the next level we can use various sub-allocators. The state pool is a + * pool of smaller, fixed size objects, which operates much like the block + * pool. It uses a free list for freeing objects, but when it runs out of + * space it just allocates a new block from the block pool. This allocator is + * intended for longer lived state objects such as SURFACE_STATE and most + * other persistent state objects in the API.
We may need to track more info + * with these objects and a pointer back to the CPU object (e.g. VkImage). In + * those cases we just allocate a slightly bigger object and put the extra + * state after the GPU state object. + * + * The state stream allocator works similarly to how the i965 DRI driver streams + * all its state. Even with Vulkan, we need to emit transient state (whether + * surface state base or dynamic state base), and for that we can just get a + * block and fill it up. These cases are local to a command buffer and the + * sub-allocator need not be thread safe. The streaming allocator gets a new + * block when it runs out of space and chains them together so they can be + * easily freed. + */ + +/* Allocations are always at least 64 byte aligned, so 1 is an invalid value. + * We use it to indicate the free list is empty. */ +#define EMPTY UINT32_MAX + +/* On FreeBSD PAGE_SIZE is already defined in + * /usr/include/machine/param.h that is indirectly + * included here. + */ +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +struct anv_mmap_cleanup { + void *map; + size_t size; +}; + +static inline uint32_t +ilog2_round_up(uint32_t value) +{ + assert(value != 0); + return 32 - __builtin_clz(value - 1); +} + +static inline uint32_t +round_to_power_of_two(uint32_t value) +{ + return 1 << ilog2_round_up(value); +} + +struct anv_state_table_cleanup { + void *map; + size_t size; +}; + +#define ANV_STATE_TABLE_CLEANUP_INIT ((struct anv_state_table_cleanup){0}) +#define ANV_STATE_ENTRY_SIZE (sizeof(struct anv_free_entry)) + +static VkResult +anv_state_table_expand_range(struct anv_state_table *table, uint32_t size); + +VkResult +anv_state_table_init(struct anv_state_table *table, + struct anv_device *device, + uint32_t initial_entries) +{ + VkResult result; + + table->device = device; + + /* Just make it 2GB up-front. The Linux kernel won't actually back it + * with pages until we either map and fault on one of them or we use + * userptr and send a chunk of it off to the GPU. + */ + table->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "state table"); + if (table->fd == -1) + return vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + + if (!u_vector_init(&table->cleanups, 8, + sizeof(struct anv_state_table_cleanup))) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_fd; + } + + table->state.next = 0; + table->state.end = 0; + table->size = 0; + + uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE; + result = anv_state_table_expand_range(table, initial_size); + if (result != VK_SUCCESS) + goto fail_cleanups; + + return VK_SUCCESS; + + fail_cleanups: + u_vector_finish(&table->cleanups); + fail_fd: + close(table->fd); + + return result; +} + +static VkResult +anv_state_table_expand_range(struct anv_state_table *table, uint32_t size) +{ + void *map; + struct anv_state_table_cleanup *cleanup; + + /* Assert that we only ever grow the pool */ + assert(size >= table->state.end); + + /* Make sure that we don't go outside the bounds of the memfd */ + if (size > BLOCK_POOL_MEMFD_SIZE) + return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY); + + cleanup = u_vector_add(&table->cleanups); + if (!cleanup) + return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY); + + *cleanup = ANV_STATE_TABLE_CLEANUP_INIT; + + /* Just leak the old map until we destroy the pool. We can't munmap it + * without races or imposing locking on the block allocate fast path. On + * the whole the leaked maps add up to less than the size of the + * current map.
MAP_POPULATE seems like the right thing to do, but we + * should try to get some numbers. + */ + map = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, table->fd, 0); + if (map == MAP_FAILED) { + return vk_errorf(table->device, VK_ERROR_OUT_OF_HOST_MEMORY, + "mmap failed: %m"); + } + + cleanup->map = map; + cleanup->size = size; + + table->map = map; + table->size = size; + + return VK_SUCCESS; +} + +static VkResult +anv_state_table_grow(struct anv_state_table *table) +{ + VkResult result = VK_SUCCESS; + + uint32_t used = align_u32(table->state.next * ANV_STATE_ENTRY_SIZE, + PAGE_SIZE); + uint32_t old_size = table->size; + + /* The block pool is always initialized to a nonzero size and this function + * is always called after initialization. + */ + assert(old_size > 0); + + uint32_t required = MAX2(used, old_size); + if (used * 2 <= required) { + /* If we're in this case then this isn't the first allocation and we + * already have enough space on both sides to hold double what we + * have allocated. There's nothing for us to do. + */ + goto done; + } + + uint32_t size = old_size * 2; + while (size < required) + size *= 2; + + assert(size > table->size); + + result = anv_state_table_expand_range(table, size); + + done: + return result; +} + +void +anv_state_table_finish(struct anv_state_table *table) +{ + struct anv_state_table_cleanup *cleanup; + + u_vector_foreach(cleanup, &table->cleanups) { + if (cleanup->map) + munmap(cleanup->map, cleanup->size); + } + + u_vector_finish(&table->cleanups); + + close(table->fd); +} + +VkResult +anv_state_table_add(struct anv_state_table *table, uint32_t *idx, + uint32_t count) +{ + struct anv_block_state state, old, new; + VkResult result; + + assert(idx); + + while(1) { + state.u64 = __sync_fetch_and_add(&table->state.u64, count); + if (state.next + count <= state.end) { + assert(table->map); + struct anv_free_entry *entry = &table->map[state.next]; + for (int i = 0; i < count; i++) { + entry[i].state.idx = state.next + i; + } + *idx = state.next; + return VK_SUCCESS; + } else if (state.next <= state.end) { + /* We allocated the first block outside the pool so we have to grow + * the pool. pool_state->next acts as a mutex: threads who try to + * allocate now will get block indexes above the current limit and + * hit futex_wait below.
+ */ + new.next = state.next + count; + do { + result = anv_state_table_grow(table); + if (result != VK_SUCCESS) + return result; + new.end = table->size / ANV_STATE_ENTRY_SIZE; + } while (new.end < new.next); + + old.u64 = __sync_lock_test_and_set(&table->state.u64, new.u64); + if (old.next != state.next) + futex_wake(&table->state.end, INT_MAX); + } else { + futex_wait(&table->state.end, state.end, NULL); + continue; + } + } +} + +void +anv_free_list_push(union anv_free_list *list, + struct anv_state_table *table, + uint32_t first, uint32_t count) +{ + union anv_free_list current, old, new; + uint32_t last = first; + + for (uint32_t i = 1; i < count; i++, last++) + table->map[last].next = last + 1; + + old.u64 = list->u64; + do { + current = old; + table->map[last].next = current.offset; + new.offset = first; + new.count = current.count + 1; + old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64); + } while (old.u64 != current.u64); +} + +struct anv_state * +anv_free_list_pop(union anv_free_list *list, + struct anv_state_table *table) +{ + union anv_free_list current, new, old; + + current.u64 = list->u64; + while (current.offset != EMPTY) { + __sync_synchronize(); + new.offset = table->map[current.offset].next; + new.count = current.count + 1; + old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64); + if (old.u64 == current.u64) { + struct anv_free_entry *entry = &table->map[current.offset]; + return &entry->state; + } + current = old; + } + + return NULL; +} + +static VkResult +anv_block_pool_expand_range(struct anv_block_pool *pool, + uint32_t center_bo_offset, uint32_t size); + +VkResult +anv_block_pool_init(struct anv_block_pool *pool, + struct anv_device *device, + const char *name, + uint64_t start_address, + uint32_t initial_size) +{ + VkResult result; + + if (device->info->verx10 >= 125) { + /* Make sure VMA addresses are 2MiB aligned for the block pool */ + assert(anv_is_aligned(start_address, 2 * 1024 * 1024)); + assert(anv_is_aligned(initial_size, 2 * 1024 * 1024)); + } + + pool->name = name; + pool->device = device; + pool->use_relocations = anv_use_relocations(device->physical); + pool->nbos = 0; + pool->size = 0; + pool->center_bo_offset = 0; + pool->start_address = intel_canonical_address(start_address); + pool->map = NULL; + + if (!pool->use_relocations) { + pool->bo = NULL; + pool->fd = -1; + } else { + /* Just make it 2GB up-front. The Linux kernel won't actually back it + * with pages until we either map and fault on one of them or we use + * userptr and send a chunk of it off to the GPU. + */ + pool->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "block pool"); + if (pool->fd == -1) + return vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + + pool->wrapper_bo = (struct anv_bo) { + .refcount = 1, + .offset = -1, + .is_wrapper = true, + }; + pool->bo = &pool->wrapper_bo; + } + + if (!u_vector_init(&pool->mmap_cleanups, 8, + sizeof(struct anv_mmap_cleanup))) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_fd; + } + + pool->state.next = 0; + pool->state.end = 0; + pool->back_state.next = 0; + pool->back_state.end = 0; + + result = anv_block_pool_expand_range(pool, 0, initial_size); + if (result != VK_SUCCESS) + goto fail_mmap_cleanups; + + /* Make the entire pool available in the front of the pool. If back + * allocation needs to use this space, the "ends" will be re-arranged. 
+ */ + pool->state.end = pool->size; + + return VK_SUCCESS; + + fail_mmap_cleanups: + u_vector_finish(&pool->mmap_cleanups); + fail_fd: + if (pool->fd >= 0) + close(pool->fd); + + return result; +} + +void +anv_block_pool_finish(struct anv_block_pool *pool) +{ + anv_block_pool_foreach_bo(bo, pool) { + assert(bo->refcount == 1); + anv_device_release_bo(pool->device, bo); + } + + struct anv_mmap_cleanup *cleanup; + u_vector_foreach(cleanup, &pool->mmap_cleanups) + munmap(cleanup->map, cleanup->size); + u_vector_finish(&pool->mmap_cleanups); + + if (pool->fd >= 0) + close(pool->fd); +} + +static VkResult +anv_block_pool_expand_range(struct anv_block_pool *pool, + uint32_t center_bo_offset, uint32_t size) +{ + /* Assert that we only ever grow the pool */ + assert(center_bo_offset >= pool->back_state.end); + assert(size - center_bo_offset >= pool->state.end); + + /* Assert that we don't go outside the bounds of the memfd */ + assert(center_bo_offset <= BLOCK_POOL_MEMFD_CENTER); + assert(!pool->use_relocations || + size - center_bo_offset <= + BLOCK_POOL_MEMFD_SIZE - BLOCK_POOL_MEMFD_CENTER); + + /* For state pool BOs we have to be a bit careful about where we place them + * in the GTT. There are two documented workarounds for state base address + * placement: Wa32bitGeneralStateOffset and Wa32bitInstructionBaseOffset + * which state that those two base addresses do not support 48-bit + * addresses and need to be placed in the bottom 32-bit range. + * Unfortunately, this is not quite accurate. + * + * The real problem is that we always set the size of our state pools in + * STATE_BASE_ADDRESS to 0xfffff (the maximum) even though the BO is most + * likely significantly smaller. We do this because we do not know at the + * time we emit STATE_BASE_ADDRESS whether or not we will need to expand + * the pool during command buffer building so we don't actually have a + * valid final size. If the address + size, as seen by STATE_BASE_ADDRESS, + * overflows 48 bits, the GPU appears to treat all accesses to the buffer + * as being out of bounds and returns zero. For dynamic state, this + * usually just leads to rendering corruptions, but shaders that are all + * zero hang the GPU immediately. + * + * The easiest solution is to do exactly what the bogus workarounds say to + * do: restrict these buffers to 32-bit addresses. We could also pin the + * BO to some particular location of our choosing, but that's significantly + * more work than just not setting a flag. So, we explicitly DO NOT set + * the EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and the kernel does all of the + * hard work for us. When using softpin, we're in control and the fixed + * addresses we choose are fine for base addresses. + */ + enum anv_bo_alloc_flags bo_alloc_flags = ANV_BO_ALLOC_CAPTURE; + if (pool->use_relocations) + bo_alloc_flags |= ANV_BO_ALLOC_32BIT_ADDRESS; + + if (!pool->use_relocations) { + uint32_t new_bo_size = size - pool->size; + struct anv_bo *new_bo; + assert(center_bo_offset == 0); + VkResult result = anv_device_alloc_bo(pool->device, + pool->name, + new_bo_size, + bo_alloc_flags | + ANV_BO_ALLOC_FIXED_ADDRESS | + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED, + pool->start_address + pool->size, + &new_bo); + if (result != VK_SUCCESS) + return result; + + pool->bos[pool->nbos++] = new_bo; + + /* This pointer will always point to the first BO in the list */ + pool->bo = pool->bos[0]; + } else { + /* Just leak the old map until we destroy the pool.
We can't munmap it + * without races or imposing locking on the block allocate fast path. On + * the whole the leaked maps adds up to less than the size of the + * current map. MAP_POPULATE seems like the right thing to do, but we + * should try to get some numbers. + */ + void *map = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, pool->fd, + BLOCK_POOL_MEMFD_CENTER - center_bo_offset); + if (map == MAP_FAILED) + return vk_errorf(pool->device, VK_ERROR_MEMORY_MAP_FAILED, + "mmap failed: %m"); + + struct anv_bo *new_bo; + VkResult result = anv_device_import_bo_from_host_ptr(pool->device, + map, size, + bo_alloc_flags, + 0 /* client_address */, + &new_bo); + if (result != VK_SUCCESS) { + munmap(map, size); + return result; + } + + struct anv_mmap_cleanup *cleanup = u_vector_add(&pool->mmap_cleanups); + if (!cleanup) { + munmap(map, size); + anv_device_release_bo(pool->device, new_bo); + return vk_error(pool->device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + cleanup->map = map; + cleanup->size = size; + + /* Now that we mapped the new memory, we can write the new + * center_bo_offset back into pool and update pool->map. */ + pool->center_bo_offset = center_bo_offset; + pool->map = map + center_bo_offset; + + pool->bos[pool->nbos++] = new_bo; + pool->wrapper_bo.map = new_bo; + } + + assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS); + pool->size = size; + + return VK_SUCCESS; +} + +/** Returns current memory map of the block pool. + * + * The returned pointer points to the map for the memory at the specified + * offset. The offset parameter is relative to the "center" of the block pool + * rather than the start of the block pool BO map. + */ +void* +anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size) +{ + if (!pool->use_relocations) { + struct anv_bo *bo = NULL; + int32_t bo_offset = 0; + anv_block_pool_foreach_bo(iter_bo, pool) { + if (offset < bo_offset + iter_bo->size) { + bo = iter_bo; + break; + } + bo_offset += iter_bo->size; + } + assert(bo != NULL); + assert(offset >= bo_offset); + assert((offset - bo_offset) + size <= bo->size); + + return bo->map + (offset - bo_offset); + } else { + return pool->map + offset; + } +} + +/** Grows and re-centers the block pool. + * + * We grow the block pool in one or both directions in such a way that the + * following conditions are met: + * + * 1) The size of the entire pool is always a power of two. + * + * 2) The pool only grows on both ends. Neither end can get + * shortened. + * + * 3) At the end of the allocation, we have about twice as much space + * allocated for each end as we have used. This way the pool doesn't + * grow too far in one direction or the other. + * + * 4) If the _alloc_back() has never been called, then the back portion of + * the pool retains a size of zero. (This makes it easier for users of + * the block pool that only want a one-sided pool.) + * + * 5) We have enough space allocated for at least one more block in + * whichever side `state` points to. + * + * 6) The center of the pool is always aligned to both the block_size of + * the pool and a 4K CPU page. + */ +static uint32_t +anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state, + uint32_t contiguous_size) +{ + VkResult result = VK_SUCCESS; + + pthread_mutex_lock(&pool->device->mutex); + + assert(state == &pool->state || state == &pool->back_state); + + /* Gather a little usage information on the pool. 
Since we may have + * threads waiting in queue to get some storage while we resize, it's + * actually possible that total_used will be larger than old_size. In + * particular, block_pool_alloc() increments state->next prior to + * calling block_pool_grow, so this ensures that we get enough space for + * whichever side tries to grow the pool. + * + * We align to a page size because it makes it easier to do our + * calculations later in such a way that we stay page-aligned. + */ + uint32_t back_used = align_u32(pool->back_state.next, PAGE_SIZE); + uint32_t front_used = align_u32(pool->state.next, PAGE_SIZE); + uint32_t total_used = front_used + back_used; + + assert(state == &pool->state || back_used > 0); + + uint32_t old_size = pool->size; + + /* The block pool is always initialized to a nonzero size and this function + * is always called after initialization. + */ + assert(old_size > 0); + + const uint32_t old_back = pool->center_bo_offset; + const uint32_t old_front = old_size - pool->center_bo_offset; + + /* The back_used and front_used may actually be smaller than the actual + * requirement because they are based on the next pointers which are + * updated prior to calling this function. + */ + uint32_t back_required = MAX2(back_used, old_back); + uint32_t front_required = MAX2(front_used, old_front); + + if (!pool->use_relocations) { + /* With softpin, the pool is made up of a bunch of buffers with separate + * maps. Make sure we have enough contiguous space that we can get a + * properly contiguous map for the next chunk. + */ + assert(old_back == 0); + front_required = MAX2(front_required, old_front + contiguous_size); + } + + if (back_used * 2 <= back_required && front_used * 2 <= front_required) { + /* If we're in this case then this isn't the first allocation and we + * already have enough space on both sides to hold double what we + * have allocated. There's nothing for us to do. + */ + goto done; + } + + uint32_t size = old_size * 2; + while (size < back_required + front_required) + size *= 2; + + assert(size > pool->size); + + /* We compute a new center_bo_offset such that, when we double the size + * of the pool, we maintain the ratio of how much is used by each side. + * This way things should remain more-or-less balanced. + */ + uint32_t center_bo_offset; + if (back_used == 0) { + /* If we're in this case then we have never called alloc_back(). In + * this case, we want to keep the offset at 0 to make things as simple + * as possible for users that don't care about back allocations. + */ + center_bo_offset = 0; + } else { + /* Try to "center" the allocation based on how much is currently in + * use on each side of the center line. + */ + center_bo_offset = ((uint64_t)size * back_used) / total_used; + + /* Align down to a multiple of the page size */ + center_bo_offset &= ~(PAGE_SIZE - 1); + + assert(center_bo_offset >= back_used); + + /* Make sure we don't shrink the back end of the pool */ + if (center_bo_offset < back_required) + center_bo_offset = back_required; + + /* Make sure that we don't shrink the front end of the pool */ + if (size - center_bo_offset < front_required) + center_bo_offset = size - front_required; + } + + assert(center_bo_offset % PAGE_SIZE == 0); + + result = anv_block_pool_expand_range(pool, center_bo_offset, size); + +done: + pthread_mutex_unlock(&pool->device->mutex); + + if (result == VK_SUCCESS) { + /* Return the appropriate new size. This function never actually + * updates state->next.
Instead, we let the caller do that because it + * needs to do so in order to maintain its concurrency model. + */ + if (state == &pool->state) { + return pool->size - pool->center_bo_offset; + } else { + assert(pool->center_bo_offset > 0); + return pool->center_bo_offset; + } + } else { + return 0; + } +} + +static uint32_t +anv_block_pool_alloc_new(struct anv_block_pool *pool, + struct anv_block_state *pool_state, + uint32_t block_size, uint32_t *padding) +{ + struct anv_block_state state, old, new; + + /* Most allocations won't generate any padding */ + if (padding) + *padding = 0; + + while (1) { + state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size); + if (state.next + block_size <= state.end) { + return state.next; + } else if (state.next <= state.end) { + if (!pool->use_relocations && state.next < state.end) { + /* We need to grow the block pool, but still have some leftover + * space that can't be used by that particular allocation. So we + * add that as a "padding", and return it. + */ + uint32_t leftover = state.end - state.next; + + /* If there is some leftover space in the pool, the caller must + * deal with it. + */ + assert(leftover == 0 || padding); + if (padding) + *padding = leftover; + state.next += leftover; + } + + /* We allocated the first block outside the pool so we have to grow + * the pool. pool_state->next acts as a mutex: threads who try to + * allocate now will get block indexes above the current limit and + * hit futex_wait below. + */ + new.next = state.next + block_size; + do { + new.end = anv_block_pool_grow(pool, pool_state, block_size); + } while (new.end < new.next); + + old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64); + if (old.next != state.next) + futex_wake(&pool_state->end, INT_MAX); + return state.next; + } else { + futex_wait(&pool_state->end, state.end, NULL); + continue; + } + } +} + +int32_t +anv_block_pool_alloc(struct anv_block_pool *pool, + uint32_t block_size, uint32_t *padding) +{ + uint32_t offset; + + offset = anv_block_pool_alloc_new(pool, &pool->state, block_size, padding); + + return offset; +} + +/* Allocates a block out of the back of the block pool. + * + * This will allocate a block earlier than the "start" of the block pool. + * The offsets returned from this function will be negative but will still + * be correct relative to the block pool's map pointer. + * + * If you ever use anv_block_pool_alloc_back, then you will have to do + * gymnastics with the block pool's BO when doing relocations. + */ +int32_t +anv_block_pool_alloc_back(struct anv_block_pool *pool, + uint32_t block_size) +{ + int32_t offset = anv_block_pool_alloc_new(pool, &pool->back_state, + block_size, NULL); + + /* The offset we get out of anv_block_pool_alloc_new() is actually the + * number of bytes downwards from the middle to the end of the block. + * We need to turn it into a (negative) offset from the middle to the + * start of the block.
+ */ + assert(offset >= 0); + return -(offset + block_size); +} + +VkResult +anv_state_pool_init(struct anv_state_pool *pool, + struct anv_device *device, + const char *name, + uint64_t base_address, + int32_t start_offset, + uint32_t block_size) +{ + /* We don't want to ever see signed overflow */ + assert(start_offset < INT32_MAX - (int32_t)BLOCK_POOL_MEMFD_SIZE); + + uint32_t initial_size = block_size * 16; + if (device->info->verx10 >= 125) + initial_size = MAX2(initial_size, 2 * 1024 * 1024); + + VkResult result = anv_block_pool_init(&pool->block_pool, device, name, + base_address + start_offset, + initial_size); + if (result != VK_SUCCESS) + return result; + + pool->start_offset = start_offset; + + result = anv_state_table_init(&pool->table, device, 64); + if (result != VK_SUCCESS) { + anv_block_pool_finish(&pool->block_pool); + return result; + } + + assert(util_is_power_of_two_or_zero(block_size)); + pool->block_size = block_size; + pool->back_alloc_free_list = ANV_FREE_LIST_EMPTY; + for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) { + pool->buckets[i].free_list = ANV_FREE_LIST_EMPTY; + pool->buckets[i].block.next = 0; + pool->buckets[i].block.end = 0; + } + VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false)); + + return VK_SUCCESS; +} + +void +anv_state_pool_finish(struct anv_state_pool *pool) +{ + VG(VALGRIND_DESTROY_MEMPOOL(pool)); + anv_state_table_finish(&pool->table); + anv_block_pool_finish(&pool->block_pool); +} + +static uint32_t +anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool, + struct anv_block_pool *block_pool, + uint32_t state_size, + uint32_t block_size, + uint32_t *padding) +{ + struct anv_block_state block, old, new; + uint32_t offset; + + /* We don't always use anv_block_pool_alloc(), which would set *padding to + * zero for us. So if we have a pointer to padding, we must zero it out + * ourselves here, to make sure we always return some sensible value. + */ + if (padding) + *padding = 0; + + /* If our state is large, we don't need any sub-allocation from a block. + * Instead, we just grab whole (potentially large) blocks. + */ + if (state_size >= block_size) + return anv_block_pool_alloc(block_pool, state_size, padding); + + restart: + block.u64 = __sync_fetch_and_add(&pool->block.u64, state_size); + + if (block.next < block.end) { + return block.next; + } else if (block.next == block.end) { + offset = anv_block_pool_alloc(block_pool, block_size, padding); + new.next = offset + state_size; + new.end = offset + block_size; + old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64); + if (old.next != block.next) + futex_wake(&pool->block.end, INT_MAX); + return offset; + } else { + futex_wait(&pool->block.end, block.end, NULL); + goto restart; + } +} + +static uint32_t +anv_state_pool_get_bucket(uint32_t size) +{ + unsigned size_log2 = ilog2_round_up(size); + assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2); + if (size_log2 < ANV_MIN_STATE_SIZE_LOG2) + size_log2 = ANV_MIN_STATE_SIZE_LOG2; + return size_log2 - ANV_MIN_STATE_SIZE_LOG2; +} + +static uint32_t +anv_state_pool_get_bucket_size(uint32_t bucket) +{ + uint32_t size_log2 = bucket + ANV_MIN_STATE_SIZE_LOG2; + return 1 << size_log2; +} + +/** Helper to push a chunk into the state table. + * + * It creates 'count' entries into the state table and update their sizes, + * offsets and maps, also pushing them as "free" states. 
+ */ +static void +anv_state_pool_return_blocks(struct anv_state_pool *pool, + uint32_t chunk_offset, uint32_t count, + uint32_t block_size) +{ + /* Disallow returning 0 chunks */ + assert(count != 0); + + /* Make sure we always return chunks aligned to the block_size */ + assert(chunk_offset % block_size == 0); + + uint32_t st_idx; + UNUSED VkResult result = anv_state_table_add(&pool->table, &st_idx, count); + assert(result == VK_SUCCESS); + for (int i = 0; i < count; i++) { + /* update states that were added back to the state table */ + struct anv_state *state_i = anv_state_table_get(&pool->table, + st_idx + i); + state_i->alloc_size = block_size; + state_i->offset = pool->start_offset + chunk_offset + block_size * i; + state_i->map = anv_block_pool_map(&pool->block_pool, + state_i->offset, + state_i->alloc_size); + } + + uint32_t block_bucket = anv_state_pool_get_bucket(block_size); + anv_free_list_push(&pool->buckets[block_bucket].free_list, + &pool->table, st_idx, count); +} + +/** Returns a chunk of memory back to the state pool. + * + * Do a two-level split. If chunk_size is bigger than divisor + * (pool->block_size), we return as many divisor sized blocks as we can, from + * the end of the chunk. + * + * The remaining is then split into smaller blocks (starting at small_size if + * it is non-zero), with larger blocks always being taken from the end of the + * chunk. + */ +static void +anv_state_pool_return_chunk(struct anv_state_pool *pool, + uint32_t chunk_offset, uint32_t chunk_size, + uint32_t small_size) +{ + uint32_t divisor = pool->block_size; + uint32_t nblocks = chunk_size / divisor; + uint32_t rest = chunk_size - nblocks * divisor; + + if (nblocks > 0) { + /* First return divisor aligned and sized chunks. We start returning + * larger blocks from the end of the chunk, since they should already be + * aligned to divisor. Also anv_state_pool_return_blocks() only accepts + * aligned chunks. + */ + uint32_t offset = chunk_offset + rest; + anv_state_pool_return_blocks(pool, offset, nblocks, divisor); + } + + chunk_size = rest; + divisor /= 2; + + if (small_size > 0 && small_size < divisor) + divisor = small_size; + + uint32_t min_size = 1 << ANV_MIN_STATE_SIZE_LOG2; + + /* Just as before, return larger divisor aligned blocks from the end of the + * chunk first. + */ + while (chunk_size > 0 && divisor >= min_size) { + nblocks = chunk_size / divisor; + rest = chunk_size - nblocks * divisor; + if (nblocks > 0) { + anv_state_pool_return_blocks(pool, chunk_offset + rest, + nblocks, divisor); + chunk_size = rest; + } + divisor /= 2; + } +} + +static struct anv_state +anv_state_pool_alloc_no_vg(struct anv_state_pool *pool, + uint32_t size, uint32_t align) +{ + uint32_t bucket = anv_state_pool_get_bucket(MAX2(size, align)); + + struct anv_state *state; + uint32_t alloc_size = anv_state_pool_get_bucket_size(bucket); + int32_t offset; + + /* Try free list first. */ + state = anv_free_list_pop(&pool->buckets[bucket].free_list, + &pool->table); + if (state) { + assert(state->offset >= pool->start_offset); + goto done; + } + + /* Try to grab a chunk from some larger bucket and split it up */ + for (unsigned b = bucket + 1; b < ANV_STATE_BUCKETS; b++) { + state = anv_free_list_pop(&pool->buckets[b].free_list, &pool->table); + if (state) { + unsigned chunk_size = anv_state_pool_get_bucket_size(b); + int32_t chunk_offset = state->offset; + + /* First lets update the state we got to its new size. offset and map + * remain the same. 
+ */ + state->alloc_size = alloc_size; + + /* Now return the unused part of the chunk back to the pool as free + * blocks + * + * There are a couple of options as to what we do with it: + * + * 1) We could fully split the chunk into state.alloc_size sized + * pieces. However, this would mean that allocating a 16B + * state could potentially split a 2MB chunk into 512K smaller + * chunks. This would lead to unnecessary fragmentation. + * + * 2) The classic "buddy allocator" method would have us split the + * chunk in half and return one half. Then we would split the + * remaining half in half and return one half, and repeat as + * needed until we get down to the size we want. However, if + * you are allocating a bunch of the same size state (which is + * the common case), this means that every other allocation has + * to go up a level and every fourth goes up two levels, etc. + * This is not nearly as efficient as it could be if we did a + * little more work up-front. + * + * 3) Split the difference between (1) and (2) by doing a + * two-level split. If it's bigger than some fixed block_size, + * we split it into block_size sized chunks and return all but + * one of them. Then we split what remains into + * state.alloc_size sized chunks and return them. + * + * We choose something close to option (3), which is implemented with + * anv_state_pool_return_chunk(). That is done by returning the + * remaining of the chunk, with alloc_size as a hint of the size that + * we want the smaller chunk split into. + */ + anv_state_pool_return_chunk(pool, chunk_offset + alloc_size, + chunk_size - alloc_size, alloc_size); + goto done; + } + } + + uint32_t padding; + offset = anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket], + &pool->block_pool, + alloc_size, + pool->block_size, + &padding); + /* Every time we allocate a new state, add it to the state pool */ + uint32_t idx; + UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1); + assert(result == VK_SUCCESS); + + state = anv_state_table_get(&pool->table, idx); + state->offset = pool->start_offset + offset; + state->alloc_size = alloc_size; + state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size); + + if (padding > 0) { + uint32_t return_offset = offset - padding; + anv_state_pool_return_chunk(pool, return_offset, padding, 0); + } + +done: + return *state; +} + +struct anv_state +anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t size, uint32_t align) +{ + if (size == 0) + return ANV_STATE_NULL; + + struct anv_state state = anv_state_pool_alloc_no_vg(pool, size, align); + VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, size)); + return state; +} + +struct anv_state +anv_state_pool_alloc_back(struct anv_state_pool *pool) +{ + struct anv_state *state; + uint32_t alloc_size = pool->block_size; + + /* This function is only used with pools where start_offset == 0 */ + assert(pool->start_offset == 0); + + state = anv_free_list_pop(&pool->back_alloc_free_list, &pool->table); + if (state) { + assert(state->offset < pool->start_offset); + goto done; + } + + int32_t offset; + offset = anv_block_pool_alloc_back(&pool->block_pool, + pool->block_size); + uint32_t idx; + UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1); + assert(result == VK_SUCCESS); + + state = anv_state_table_get(&pool->table, idx); + state->offset = pool->start_offset + offset; + state->alloc_size = alloc_size; + state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size); + +done: + VG(VALGRIND_MEMPOOL_ALLOC(pool, 
state->map, state->alloc_size)); + return *state; +} + +static void +anv_state_pool_free_no_vg(struct anv_state_pool *pool, struct anv_state state) +{ + assert(util_is_power_of_two_or_zero(state.alloc_size)); + unsigned bucket = anv_state_pool_get_bucket(state.alloc_size); + + if (state.offset < pool->start_offset) { + assert(state.alloc_size == pool->block_size); + anv_free_list_push(&pool->back_alloc_free_list, + &pool->table, state.idx, 1); + } else { + anv_free_list_push(&pool->buckets[bucket].free_list, + &pool->table, state.idx, 1); + } +} + +void +anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state) +{ + if (state.alloc_size == 0) + return; + + VG(VALGRIND_MEMPOOL_FREE(pool, state.map)); + anv_state_pool_free_no_vg(pool, state); +} + +struct anv_state_stream_block { + struct anv_state block; + + /* The next block */ + struct anv_state_stream_block *next; + +#ifdef HAVE_VALGRIND + /* A pointer to the first user-allocated thing in this block. This is + * what valgrind sees as the start of the block. + */ + void *_vg_ptr; +#endif +}; + +/* The state stream allocator is a one-shot, single threaded allocator for + * variable sized blocks. We use it for allocating dynamic state. + */ +void +anv_state_stream_init(struct anv_state_stream *stream, + struct anv_state_pool *state_pool, + uint32_t block_size) +{ + stream->state_pool = state_pool; + stream->block_size = block_size; + + stream->block = ANV_STATE_NULL; + + /* Ensure that next + whatever > block_size. This way the first call to + * state_stream_alloc fetches a new block. + */ + stream->next = block_size; + + util_dynarray_init(&stream->all_blocks, NULL); + + VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false)); +} + +void +anv_state_stream_finish(struct anv_state_stream *stream) +{ + util_dynarray_foreach(&stream->all_blocks, struct anv_state, block) { + VG(VALGRIND_MEMPOOL_FREE(stream, block->map)); + VG(VALGRIND_MAKE_MEM_NOACCESS(block->map, block->alloc_size)); + anv_state_pool_free_no_vg(stream->state_pool, *block); + } + util_dynarray_fini(&stream->all_blocks); + + VG(VALGRIND_DESTROY_MEMPOOL(stream)); +} + +struct anv_state +anv_state_stream_alloc(struct anv_state_stream *stream, + uint32_t size, uint32_t alignment) +{ + if (size == 0) + return ANV_STATE_NULL; + + assert(alignment <= PAGE_SIZE); + + uint32_t offset = align_u32(stream->next, alignment); + if (offset + size > stream->block.alloc_size) { + uint32_t block_size = stream->block_size; + if (block_size < size) + block_size = round_to_power_of_two(size); + + stream->block = anv_state_pool_alloc_no_vg(stream->state_pool, + block_size, PAGE_SIZE); + util_dynarray_append(&stream->all_blocks, + struct anv_state, stream->block); + VG(VALGRIND_MAKE_MEM_NOACCESS(stream->block.map, block_size)); + + /* Reset back to the start */ + stream->next = offset = 0; + assert(offset + size <= stream->block.alloc_size); + } + const bool new_block = stream->next == 0; + + struct anv_state state = stream->block; + state.offset += offset; + state.alloc_size = size; + state.map += offset; + + stream->next = offset + size; + + if (new_block) { + assert(state.map == stream->block.map); + VG(VALGRIND_MEMPOOL_ALLOC(stream, state.map, size)); + } else { + /* This only updates the mempool. The newly allocated chunk is still + * marked as NOACCESS. 
*/ + VG(VALGRIND_MEMPOOL_CHANGE(stream, stream->block.map, stream->block.map, + stream->next)); + /* Mark the newly allocated chunk as undefined */ + VG(VALGRIND_MAKE_MEM_UNDEFINED(state.map, state.alloc_size)); + } + + return state; +} + +void +anv_state_reserved_pool_init(struct anv_state_reserved_pool *pool, + struct anv_state_pool *parent, + uint32_t count, uint32_t size, uint32_t alignment) +{ + pool->pool = parent; + pool->reserved_blocks = ANV_FREE_LIST_EMPTY; + pool->count = count; + + for (unsigned i = 0; i < count; i++) { + struct anv_state state = anv_state_pool_alloc(pool->pool, size, alignment); + anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1); + } +} + +void +anv_state_reserved_pool_finish(struct anv_state_reserved_pool *pool) +{ + struct anv_state *state; + + while ((state = anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table))) { + anv_state_pool_free(pool->pool, *state); + pool->count--; + } + assert(pool->count == 0); +} + +struct anv_state +anv_state_reserved_pool_alloc(struct anv_state_reserved_pool *pool) +{ + return *anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table); +} + +void +anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool, + struct anv_state state) +{ + anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1); +} + +void +anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device, + const char *name) +{ + pool->name = name; + pool->device = device; + for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) { + util_sparse_array_free_list_init(&pool->free_list[i], + &device->bo_cache.bo_map, 0, + offsetof(struct anv_bo, free_index)); + } + + VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false)); +} + +void +anv_bo_pool_finish(struct anv_bo_pool *pool) +{ + for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) { + while (1) { + struct anv_bo *bo = + util_sparse_array_free_list_pop_elem(&pool->free_list[i]); + if (bo == NULL) + break; + + /* anv_device_release_bo is going to "free" it */ + VG(VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, 1)); + anv_device_release_bo(pool->device, bo); + } + } + + VG(VALGRIND_DESTROY_MEMPOOL(pool)); +} + +VkResult +anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size, + struct anv_bo **bo_out) +{ + const unsigned size_log2 = size < 4096 ? 
12 : ilog2_round_up(size); + const unsigned pow2_size = 1 << size_log2; + const unsigned bucket = size_log2 - 12; + assert(bucket < ARRAY_SIZE(pool->free_list)); + + struct anv_bo *bo = + util_sparse_array_free_list_pop_elem(&pool->free_list[bucket]); + if (bo != NULL) { + VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size)); + *bo_out = bo; + return VK_SUCCESS; + } + + VkResult result = anv_device_alloc_bo(pool->device, + pool->name, + pow2_size, + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED | + ANV_BO_ALLOC_CAPTURE, + 0 /* explicit_address */, + &bo); + if (result != VK_SUCCESS) + return result; + + /* We want it to look like it came from this pool */ + VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0)); + VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size)); + + *bo_out = bo; + + return VK_SUCCESS; +} + +void +anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo) +{ + VG(VALGRIND_MEMPOOL_FREE(pool, bo->map)); + + assert(util_is_power_of_two_or_zero(bo->size)); + const unsigned size_log2 = ilog2_round_up(bo->size); + const unsigned bucket = size_log2 - 12; + assert(bucket < ARRAY_SIZE(pool->free_list)); + + assert(util_sparse_array_get(&pool->device->bo_cache.bo_map, + bo->gem_handle) == bo); + util_sparse_array_free_list_push(&pool->free_list[bucket], + &bo->gem_handle, 1); +} + +// Scratch pool + +void +anv_scratch_pool_init(struct anv_device *device, struct anv_scratch_pool *pool) +{ + memset(pool, 0, sizeof(*pool)); +} + +void +anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool) +{ + for (unsigned s = 0; s < ARRAY_SIZE(pool->bos[0]); s++) { + for (unsigned i = 0; i < 16; i++) { + if (pool->bos[i][s] != NULL) + anv_device_release_bo(device, pool->bos[i][s]); + } + } + + for (unsigned i = 0; i < 16; i++) { + if (pool->surf_states[i].map != NULL) { + anv_state_pool_free(&device->surface_state_pool, + pool->surf_states[i]); + } + } +} + +struct anv_bo * +anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool, + gl_shader_stage stage, unsigned per_thread_scratch) +{ + if (per_thread_scratch == 0) + return NULL; + + unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048); + assert(scratch_size_log2 < 16); + + assert(stage < ARRAY_SIZE(pool->bos)); + + const struct intel_device_info *devinfo = device->info; + + /* On GFX version 12.5, scratch access changed to a surface-based model. + * Instead of each shader type having its own layout based on IDs passed + * from the relevant fixed-function unit, all scratch access is based on + * thread IDs like it always has been for compute. + */ + if (devinfo->verx10 >= 125) + stage = MESA_SHADER_COMPUTE; + + struct anv_bo *bo = p_atomic_read(&pool->bos[scratch_size_log2][stage]); + + if (bo != NULL) + return bo; + + assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids)); + uint32_t size = per_thread_scratch * devinfo->max_scratch_ids[stage]; + + /* Even though the Scratch base pointers in 3DSTATE_*S are 64 bits, they + * are still relative to the general state base address. When we emit + * STATE_BASE_ADDRESS, we set general state base address to 0 and the size + * to the maximum (1 page under 4GB). This allows us to just place the + * scratch buffers anywhere we wish in the bottom 32 bits of address space + * and just set the scratch base pointer in 3DSTATE_*S using a relocation. + * However, in order to do so, we need to ensure that the kernel does not + * place the scratch BO above the 32-bit boundary. + * + * NOTE: Technically, it can't go "anywhere" because the top page is off + * limits. 
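
The BO pool allocation above buckets requests by power-of-two size, with 4 KiB (2^12) as the smallest class, so a freed BO can later satisfy any request that rounds up to the same class. A self-contained sketch of that bucket computation (the demo_ helpers are hypothetical, not the driver's):

#include <stdint.h>

/* Smallest n such that (1u << n) >= size; size must be non-zero. */
static unsigned
demo_ilog2_round_up(uint32_t size)
{
   unsigned n = 0;
   while ((1ull << n) < size)
      n++;
   return n;
}

/* Bucket 0 holds 4 KiB BOs, bucket 1 holds 8 KiB, and so on. */
static unsigned
demo_bo_pool_bucket(uint32_t size)
{
   const unsigned size_log2 = size < 4096 ? 12 : demo_ilog2_round_up(size);
   return size_log2 - 12;
}

For example, demo_bo_pool_bucket(6000) lands in bucket 1, the same 8 KiB class that a later 8 KiB request would search first.
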
However, when EXEC_OBJECT_SUPPORTS_48B_ADDRESS is set, the + * kernel allocates space using + * + * end = min_t(u64, end, (1ULL << 32) - I915_GTT_PAGE_SIZE); + * + * so nothing will ever touch the top page. + */ + const enum anv_bo_alloc_flags alloc_flags = + devinfo->verx10 < 125 ? ANV_BO_ALLOC_32BIT_ADDRESS : 0; + VkResult result = anv_device_alloc_bo(device, "scratch", size, + alloc_flags, + 0 /* explicit_address */, + &bo); + if (result != VK_SUCCESS) + return NULL; /* TODO */ + + struct anv_bo *current_bo = + p_atomic_cmpxchg(&pool->bos[scratch_size_log2][stage], NULL, bo); + if (current_bo) { + anv_device_release_bo(device, bo); + return current_bo; + } else { + return bo; + } +} + +uint32_t +anv_scratch_pool_get_surf(struct anv_device *device, + struct anv_scratch_pool *pool, + unsigned per_thread_scratch) +{ + if (per_thread_scratch == 0) + return 0; + + unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048); + assert(scratch_size_log2 < 16); + + uint32_t surf = p_atomic_read(&pool->surfs[scratch_size_log2]); + if (surf > 0) + return surf; + + struct anv_bo *bo = + anv_scratch_pool_alloc(device, pool, MESA_SHADER_COMPUTE, + per_thread_scratch); + struct anv_address addr = { .bo = bo }; + + struct anv_state state = + anv_state_pool_alloc(&device->surface_state_pool, + device->isl_dev.ss.size, 64); + + isl_buffer_fill_state(&device->isl_dev, state.map, + .address = anv_address_physical(addr), + .size_B = bo->size, + .mocs = anv_mocs(device, bo, 0), + .format = ISL_FORMAT_RAW, + .swizzle = ISL_SWIZZLE_IDENTITY, + .stride_B = per_thread_scratch, + .is_scratch = true); + + uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2], + 0, state.offset); + if (current) { + anv_state_pool_free(&device->surface_state_pool, state); + return current; + } else { + pool->surf_states[scratch_size_log2] = state; + return state.offset; + } +} + +VkResult +anv_bo_cache_init(struct anv_bo_cache *cache, struct anv_device *device) +{ + util_sparse_array_init(&cache->bo_map, sizeof(struct anv_bo), 1024); + + if (pthread_mutex_init(&cache->mutex, NULL)) { + util_sparse_array_finish(&cache->bo_map); + return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY, + "pthread_mutex_init failed: %m"); + } + + return VK_SUCCESS; +} + +void +anv_bo_cache_finish(struct anv_bo_cache *cache) +{ + util_sparse_array_finish(&cache->bo_map); + pthread_mutex_destroy(&cache->mutex); +} + +#define ANV_BO_CACHE_SUPPORTED_FLAGS \ + (EXEC_OBJECT_WRITE | \ + EXEC_OBJECT_ASYNC | \ + EXEC_OBJECT_SUPPORTS_48B_ADDRESS | \ + EXEC_OBJECT_PINNED | \ + EXEC_OBJECT_CAPTURE) + +static uint32_t +anv_bo_alloc_flags_to_bo_flags(struct anv_device *device, + enum anv_bo_alloc_flags alloc_flags) +{ + struct anv_physical_device *pdevice = device->physical; + + uint64_t bo_flags = 0; + if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS) && + pdevice->supports_48bit_addresses) + bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + + if ((alloc_flags & ANV_BO_ALLOC_CAPTURE) && pdevice->has_exec_capture) + bo_flags |= EXEC_OBJECT_CAPTURE; + + if (alloc_flags & ANV_BO_ALLOC_IMPLICIT_WRITE) { + assert(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC); + bo_flags |= EXEC_OBJECT_WRITE; + } + + if (!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC) && pdevice->has_exec_async) + bo_flags |= EXEC_OBJECT_ASYNC; + + if (pdevice->use_softpin) + bo_flags |= EXEC_OBJECT_PINNED; + + return bo_flags; +} + +static void +anv_bo_finish(struct anv_device *device, struct anv_bo *bo) +{ + if (bo->offset != 0 && anv_bo_is_pinned(bo) && !bo->has_fixed_address) + anv_vma_free(device, 
bo->offset, bo->size + bo->_ccs_size); + + if (bo->map && !bo->from_host_ptr) + anv_device_unmap_bo(device, bo, bo->map, bo->size); + + assert(bo->gem_handle != 0); + anv_gem_close(device, bo->gem_handle); +} + +static VkResult +anv_bo_vma_alloc_or_close(struct anv_device *device, + struct anv_bo *bo, + enum anv_bo_alloc_flags alloc_flags, + uint64_t explicit_address) +{ + assert(anv_bo_is_pinned(bo)); + assert(explicit_address == intel_48b_address(explicit_address)); + + uint32_t align = 4096; + + /* Gen12 CCS surface addresses need to be 64K aligned. */ + if (device->info->ver >= 12 && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)) + align = 64 * 1024; + + /* For XeHP, lmem and smem cannot share a single PDE, which means they + * can't live in the same 2MiB aligned region. + */ + if (device->info->verx10 >= 125) + align = 2 * 1024 * 1024; + + if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) { + bo->has_fixed_address = true; + bo->offset = explicit_address; + } else { + bo->offset = anv_vma_alloc(device, bo->size + bo->_ccs_size, + align, alloc_flags, explicit_address); + if (bo->offset == 0) { + anv_bo_finish(device, bo); + return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "failed to allocate virtual address for BO"); + } + } + + return VK_SUCCESS; +} + +VkResult +anv_device_alloc_bo(struct anv_device *device, + const char *name, + uint64_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t explicit_address, + struct anv_bo **bo_out) +{ + if (!device->physical->has_implicit_ccs) + assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)); + + const uint32_t bo_flags = + anv_bo_alloc_flags_to_bo_flags(device, alloc_flags); + assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS)); + + /* The kernel is going to give us whole pages anyway */ + size = align_u64(size, 4096); + + uint64_t ccs_size = 0; + if (device->info->has_aux_map && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)) { + /* Align the size up to the next multiple of 64K so we don't have any + * AUX-TT entries pointing from a 64K page to itself. + */ + size = align_u64(size, 64 * 1024); + + /* See anv_bo::_ccs_size */ + ccs_size = align_u64(DIV_ROUND_UP(size, INTEL_AUX_MAP_GFX12_CCS_SCALE), 4096); + } + + uint32_t gem_handle; + + /* If we have vram size, we have multiple memory regions and should choose + * one of them. + */ + if (anv_physical_device_has_vram(device->physical)) { + struct drm_i915_gem_memory_class_instance regions[2]; + uint32_t nregions = 0; + + /* This always try to put the object in local memory. Here + * vram_non_mappable & vram_mappable actually are the same region. + */ + regions[nregions++] = device->physical->vram_non_mappable.region; + + /* If the buffer is mapped on the host, add the system memory region. + * This ensures that if the buffer cannot live in mappable local memory, + * it can be spilled to system memory. 
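
The sizing logic above page-aligns the request, bumps it to 64 KiB granularity when an implicit CCS is attached, and then derives the CCS size as a fixed fraction of the main surface. A standalone sketch of that arithmetic; the demo_ names are hypothetical, and the ccs_scale parameter stands in for INTEL_AUX_MAP_GFX12_CCS_SCALE (assumed to be on the order of a few hundred to one):

#include <stdbool.h>
#include <stdint.h>

#define DEMO_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* a must be a power of two */
static uint64_t
demo_align_u64(uint64_t v, uint64_t a)
{
   return (v + a - 1) & ~(a - 1);
}

static void
demo_size_bo(uint64_t request, bool wants_implicit_ccs, uint64_t ccs_scale,
             uint64_t *size_out, uint64_t *ccs_size_out)
{
   uint64_t size = demo_align_u64(request, 4096);   /* whole pages anyway */
   uint64_t ccs_size = 0;
   if (wants_implicit_ccs) {
      size = demo_align_u64(size, 64 * 1024);       /* avoid AUX-TT entries
                                                     * pointing into themselves */
      ccs_size = demo_align_u64(DEMO_DIV_ROUND_UP(size, ccs_scale), 4096);
   }
   *size_out = size;
   *ccs_size_out = ccs_size;
}
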
+ */ + uint32_t flags = 0; + if ((alloc_flags & ANV_BO_ALLOC_MAPPED) || + (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE)) { + regions[nregions++] = device->physical->sys.region; + if (device->physical->vram_non_mappable.size > 0) + flags |= I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS; + } + + gem_handle = anv_gem_create_regions(device, size + ccs_size, + flags, nregions, regions); + } else { + gem_handle = anv_gem_create(device, size + ccs_size); + } + + if (gem_handle == 0) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + struct anv_bo new_bo = { + .name = name, + .gem_handle = gem_handle, + .refcount = 1, + .offset = -1, + .size = size, + ._ccs_size = ccs_size, + .flags = bo_flags, + .is_external = (alloc_flags & ANV_BO_ALLOC_EXTERNAL), + .has_client_visible_address = + (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0, + .has_implicit_ccs = ccs_size > 0 || device->info->verx10 >= 125, + }; + + if (alloc_flags & ANV_BO_ALLOC_MAPPED) { + VkResult result = anv_device_map_bo(device, &new_bo, 0, size, + 0 /* gem_flags */, &new_bo.map); + if (unlikely(result != VK_SUCCESS)) { + anv_gem_close(device, new_bo.gem_handle); + return result; + } + } + + if (alloc_flags & ANV_BO_ALLOC_SNOOPED) { + assert(alloc_flags & ANV_BO_ALLOC_MAPPED); + /* We don't want to change these defaults if it's going to be shared + * with another process. + */ + assert(!(alloc_flags & ANV_BO_ALLOC_EXTERNAL)); + + /* Regular objects are created I915_CACHING_CACHED on LLC platforms and + * I915_CACHING_NONE on non-LLC platforms. For many internal state + * objects, we'd rather take the snooping overhead than risk forgetting + * a CLFLUSH somewhere. Userptr objects are always created as + * I915_CACHING_CACHED, which on non-LLC means snooped so there's no + * need to do this there. + */ + if (!device->info->has_llc) { + anv_gem_set_caching(device, new_bo.gem_handle, + I915_CACHING_CACHED); + } + } + + if (anv_bo_is_pinned(&new_bo)) { + VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo, + alloc_flags, + explicit_address); + if (result != VK_SUCCESS) + return result; + } else { + assert(!new_bo.has_client_visible_address); + } + + if (new_bo._ccs_size > 0) { + assert(device->info->has_aux_map); + intel_aux_map_add_mapping(device->aux_map_ctx, + intel_canonical_address(new_bo.offset), + intel_canonical_address(new_bo.offset + new_bo.size), + new_bo.size, 0 /* format_bits */); + } + + assert(new_bo.gem_handle); + + /* If we just got this gem_handle from anv_bo_init_new then we know no one + * else is touching this BO at the moment so we don't need to lock here. 
+ */ + struct anv_bo *bo = anv_device_lookup_bo(device, new_bo.gem_handle); + *bo = new_bo; + + *bo_out = bo; + + return VK_SUCCESS; +} + +VkResult +anv_device_map_bo(struct anv_device *device, + struct anv_bo *bo, + uint64_t offset, + size_t size, + uint32_t gem_flags, + void **map_out) +{ + assert(!bo->is_wrapper && !bo->from_host_ptr); + assert(size > 0); + + void *map = anv_gem_mmap(device, bo->gem_handle, offset, size, gem_flags); + if (unlikely(map == MAP_FAILED)) + return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m"); + + assert(map != NULL); + + if (map_out) + *map_out = map; + + return VK_SUCCESS; +} + +void +anv_device_unmap_bo(struct anv_device *device, + struct anv_bo *bo, + void *map, size_t map_size) +{ + assert(!bo->is_wrapper && !bo->from_host_ptr); + + anv_gem_munmap(device, map, map_size); +} + +VkResult +anv_device_import_bo_from_host_ptr(struct anv_device *device, + void *host_ptr, uint32_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address, + struct anv_bo **bo_out) +{ + assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED | + ANV_BO_ALLOC_FIXED_ADDRESS))); + + assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS) || + (device->physical->has_implicit_ccs && device->info->has_aux_map)); + + struct anv_bo_cache *cache = &device->bo_cache; + const uint32_t bo_flags = + anv_bo_alloc_flags_to_bo_flags(device, alloc_flags); + assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS)); + + uint32_t gem_handle = anv_gem_userptr(device, host_ptr, size); + if (!gem_handle) + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + + pthread_mutex_lock(&cache->mutex); + + struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); + if (bo->refcount > 0) { + /* VK_EXT_external_memory_host doesn't require handling importing the + * same pointer twice at the same time, but we don't get in the way. If + * kernel gives us the same gem_handle, only succeed if the flags match. 
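
Both import paths (the host-pointer one here and the dma-buf one below) follow the same cache discipline: look the GEM handle up while holding the cache mutex, then either validate and take a reference on an existing entry or initialize a fresh one in place. A minimal sketch of that lookup-or-addref pattern with a plain array standing in for the sparse table (demo_ names are hypothetical):

#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <string.h>

struct demo_bo { uint32_t refcount; uint32_t flags; };

struct demo_cache {
   pthread_mutex_t mutex;
   struct demo_bo  bos[1024];   /* indexed directly by GEM handle here */
};

/* Returns the cached entry with one extra reference taken, or NULL on a flag
 * mismatch (the driver returns VK_ERROR_INVALID_EXTERNAL_HANDLE there).
 */
static struct demo_bo *
demo_cache_import(struct demo_cache *c, uint32_t handle, uint32_t flags)
{
   assert(handle < 1024);
   pthread_mutex_lock(&c->mutex);
   struct demo_bo *bo = &c->bos[handle];
   if (bo->refcount > 0) {
      if (bo->flags != flags) {          /* same handle imported two ways */
         pthread_mutex_unlock(&c->mutex);
         return NULL;
      }
      bo->refcount++;
   } else {
      memset(bo, 0, sizeof(*bo));
      bo->refcount = 1;
      bo->flags = flags;
   }
   pthread_mutex_unlock(&c->mutex);
   return bo;
}
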
+ */ + assert(bo->gem_handle == gem_handle); + if (bo_flags != bo->flags) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "same host pointer imported two different ways"); + } + + if (bo->has_client_visible_address != + ((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported with and without buffer " + "device address"); + } + + if (client_address && client_address != intel_48b_address(bo->offset)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported at two different " + "addresses"); + } + + __sync_fetch_and_add(&bo->refcount, 1); + } else { + struct anv_bo new_bo = { + .name = "host-ptr", + .gem_handle = gem_handle, + .refcount = 1, + .offset = -1, + .size = size, + .map = host_ptr, + .flags = bo_flags, + .is_external = true, + .from_host_ptr = true, + .has_client_visible_address = + (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0, + }; + + if (anv_bo_is_pinned(&new_bo)) { + VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo, + alloc_flags, + client_address); + if (result != VK_SUCCESS) { + pthread_mutex_unlock(&cache->mutex); + return result; + } + } else { + assert(!new_bo.has_client_visible_address); + } + + *bo = new_bo; + } + + pthread_mutex_unlock(&cache->mutex); + *bo_out = bo; + + return VK_SUCCESS; +} + +VkResult +anv_device_import_bo(struct anv_device *device, + int fd, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address, + struct anv_bo **bo_out) +{ + assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED | + ANV_BO_ALLOC_FIXED_ADDRESS))); + + assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS) || + (device->physical->has_implicit_ccs && device->info->has_aux_map)); + + struct anv_bo_cache *cache = &device->bo_cache; + const uint32_t bo_flags = + anv_bo_alloc_flags_to_bo_flags(device, alloc_flags); + assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS)); + + pthread_mutex_lock(&cache->mutex); + + uint32_t gem_handle = anv_gem_fd_to_handle(device, fd); + if (!gem_handle) { + pthread_mutex_unlock(&cache->mutex); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } + + struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); + if (bo->refcount > 0) { + /* We have to be careful how we combine flags so that it makes sense. + * Really, though, if we get to this case and it actually matters, the + * client has imported a BO twice in different ways and they get what + * they have coming. + */ + uint64_t new_flags = 0; + new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_WRITE; + new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_ASYNC; + new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_PINNED; + new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_CAPTURE; + + /* It's theoretically possible for a BO to get imported such that it's + * both pinned and not pinned. The only way this can happen is if it + * gets imported as both a semaphore and a memory object and that would + * be an application error. Just fail out in that case. 
+ */ + if ((bo->flags & EXEC_OBJECT_PINNED) != + (bo_flags & EXEC_OBJECT_PINNED)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported two different ways"); + } + + /* It's also theoretically possible that someone could export a BO from + * one heap and import it into another or to import the same BO into two + * different heaps. If this happens, we could potentially end up both + * allowing and disallowing 48-bit addresses. There's not much we can + * do about it if we're pinning so we just throw an error and hope no + * app is actually that stupid. + */ + if ((new_flags & EXEC_OBJECT_PINNED) && + (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) != + (bo_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported on two different heaps"); + } + + if (bo->has_client_visible_address != + ((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported with and without buffer " + "device address"); + } + + if (client_address && client_address != intel_48b_address(bo->offset)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported at two different " + "addresses"); + } + + bo->flags = new_flags; + + __sync_fetch_and_add(&bo->refcount, 1); + } else { + off_t size = lseek(fd, 0, SEEK_END); + if (size == (off_t)-1) { + anv_gem_close(device, gem_handle); + pthread_mutex_unlock(&cache->mutex); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } + + struct anv_bo new_bo = { + .name = "imported", + .gem_handle = gem_handle, + .refcount = 1, + .offset = -1, + .size = size, + .flags = bo_flags, + .is_external = true, + .has_client_visible_address = + (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0, + }; + + if (anv_bo_is_pinned(&new_bo)) { + assert(new_bo._ccs_size == 0); + VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo, + alloc_flags, + client_address); + if (result != VK_SUCCESS) { + pthread_mutex_unlock(&cache->mutex); + return result; + } + } else { + assert(!new_bo.has_client_visible_address); + } + + *bo = new_bo; + } + + pthread_mutex_unlock(&cache->mutex); + *bo_out = bo; + + return VK_SUCCESS; +} + +VkResult +anv_device_export_bo(struct anv_device *device, + struct anv_bo *bo, int *fd_out) +{ + assert(anv_device_lookup_bo(device, bo->gem_handle) == bo); + + /* This BO must have been flagged external in order for us to be able + * to export it. This is done based on external options passed into + * anv_AllocateMemory. 
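
Sizing the imported dma-buf with lseek(fd, 0, SEEK_END), as the fd import above does, works because dma-buf file descriptors report their size through the seek interface. A tiny helper in the same spirit (hypothetical demo_ name):

#include <sys/types.h>
#include <unistd.h>

static off_t
demo_fd_size(int fd)
{
   /* lseek() returns (off_t)-1 and sets errno on failure, e.g. for a pipe. */
   return lseek(fd, 0, SEEK_END);
}
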
+ */ + assert(bo->is_external); + + int fd = anv_gem_handle_to_fd(device, bo->gem_handle); + if (fd < 0) + return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS); + + *fd_out = fd; + + return VK_SUCCESS; +} + +VkResult +anv_device_get_bo_tiling(struct anv_device *device, + struct anv_bo *bo, + enum isl_tiling *tiling_out) +{ + int i915_tiling = anv_gem_get_tiling(device, bo->gem_handle); + if (i915_tiling < 0) { + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "failed to get BO tiling: %m"); + } + + *tiling_out = isl_tiling_from_i915_tiling(i915_tiling); + + return VK_SUCCESS; +} + +VkResult +anv_device_set_bo_tiling(struct anv_device *device, + struct anv_bo *bo, + uint32_t row_pitch_B, + enum isl_tiling tiling) +{ + int ret = anv_gem_set_tiling(device, bo->gem_handle, row_pitch_B, + isl_tiling_to_i915_tiling(tiling)); + if (ret) { + return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "failed to set BO tiling: %m"); + } + + return VK_SUCCESS; +} + +static bool +atomic_dec_not_one(uint32_t *counter) +{ + uint32_t old, val; + + val = *counter; + while (1) { + if (val == 1) + return false; + + old = __sync_val_compare_and_swap(counter, val, val - 1); + if (old == val) + return true; + + val = old; + } +} + +void +anv_device_release_bo(struct anv_device *device, + struct anv_bo *bo) +{ + struct anv_bo_cache *cache = &device->bo_cache; + assert(anv_device_lookup_bo(device, bo->gem_handle) == bo); + + /* Try to decrement the counter but don't go below one. If this succeeds + * then the refcount has been decremented and we are not the last + * reference. + */ + if (atomic_dec_not_one(&bo->refcount)) + return; + + pthread_mutex_lock(&cache->mutex); + + /* We are probably the last reference since our attempt to decrement above + * failed. However, we can't actually know until we are inside the mutex. + * Otherwise, someone could import the BO between the decrement and our + * taking the mutex. + */ + if (unlikely(__sync_sub_and_fetch(&bo->refcount, 1) > 0)) { + /* Turns out we're not the last reference. Unlock and bail. */ + pthread_mutex_unlock(&cache->mutex); + return; + } + assert(bo->refcount == 0); + + if (bo->_ccs_size > 0) { + assert(device->physical->has_implicit_ccs); + assert(device->info->has_aux_map); + assert(bo->has_implicit_ccs); + intel_aux_map_unmap_range(device->aux_map_ctx, + intel_canonical_address(bo->offset), + bo->size); + } + + /* Memset the BO just in case. The refcount being zero should be enough to + * prevent someone from assuming the data is valid but it's safer to just + * stomp to zero just in case. We explicitly do this *before* we actually + * close the GEM handle to ensure that if anyone allocates something and + * gets the same GEM handle, the memset has already happen and won't stomp + * all over any data they may write in this BO. + */ + struct anv_bo old_bo = *bo; + + memset(bo, 0, sizeof(*bo)); + + anv_bo_finish(device, &old_bo); + + /* Don't unlock until we've actually closed the BO. The whole point of + * the BO cache is to ensure that we correctly handle races with creating + * and releasing GEM handles and we don't want to let someone import the BO + * again between mutex unlock and closing the GEM handle. + */ + pthread_mutex_unlock(&cache->mutex); +} diff --git a/src/intel/vulkan_hasvk/anv_android.c b/src/intel/vulkan_hasvk/anv_android.c new file mode 100644 index 00000000000..8a17f0a2454 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_android.c @@ -0,0 +1,792 @@ +/* + * Copyright © 2017, Google Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#if ANDROID_API_LEVEL >= 26 +#include +#endif + +#include +#include +#include +#include +#include + +#include "anv_private.h" +#include "vk_common_entrypoints.h" +#include "vk_util.h" + +static int anv_hal_open(const struct hw_module_t* mod, const char* id, struct hw_device_t** dev); +static int anv_hal_close(struct hw_device_t *dev); + +static void UNUSED +static_asserts(void) +{ + STATIC_ASSERT(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC); +} + +PUBLIC struct hwvulkan_module_t HAL_MODULE_INFO_SYM = { + .common = { + .tag = HARDWARE_MODULE_TAG, + .module_api_version = HWVULKAN_MODULE_API_VERSION_0_1, + .hal_api_version = HARDWARE_MAKE_API_VERSION(1, 0), + .id = HWVULKAN_HARDWARE_MODULE_ID, + .name = "Intel Vulkan HAL", + .author = "Intel", + .methods = &(hw_module_methods_t) { + .open = anv_hal_open, + }, + }, +}; + +/* If any bits in test_mask are set, then unset them and return true. */ +static inline bool +unmask32(uint32_t *inout_mask, uint32_t test_mask) +{ + uint32_t orig_mask = *inout_mask; + *inout_mask &= ~test_mask; + return *inout_mask != orig_mask; +} + +static int +anv_hal_open(const struct hw_module_t* mod, const char* id, + struct hw_device_t** dev) +{ + assert(mod == &HAL_MODULE_INFO_SYM.common); + assert(strcmp(id, HWVULKAN_DEVICE_0) == 0); + + hwvulkan_device_t *hal_dev = malloc(sizeof(*hal_dev)); + if (!hal_dev) + return -1; + + *hal_dev = (hwvulkan_device_t) { + .common = { + .tag = HARDWARE_DEVICE_TAG, + .version = HWVULKAN_DEVICE_API_VERSION_0_1, + .module = &HAL_MODULE_INFO_SYM.common, + .close = anv_hal_close, + }, + .EnumerateInstanceExtensionProperties = anv_EnumerateInstanceExtensionProperties, + .CreateInstance = anv_CreateInstance, + .GetInstanceProcAddr = anv_GetInstanceProcAddr, + }; + + *dev = &hal_dev->common; + return 0; +} + +static int +anv_hal_close(struct hw_device_t *dev) +{ + /* hwvulkan.h claims that hw_device_t::close() is never called. */ + return -1; +} + +#if ANDROID_API_LEVEL >= 26 +#include +/* See i915_private_android_types.h in minigbm. 
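
The unmask32() helper above is what setup_gralloc0_usage() later uses to consume Vulkan usage bits group by group and then reject anything left over. A short illustration of that consume-and-check pattern; the bit values here are invented placeholders, not real Vulkan or gralloc flags:

/* Classify a usage mask with the unmask32() helper defined above: each call
 * strips one group of bits, and any leftover bits mean the usage is
 * unsupported, signalled here by returning -1.
 */
static int
demo_classify_usage(uint32_t usage)
{
   int gralloc = 0;
   if (unmask32(&usage, 0x3))   /* pretend: TRANSFER_DST | COLOR_ATTACHMENT */
      gralloc |= 0x1;           /* pretend: GRALLOC_USAGE_HW_RENDER */
   if (unmask32(&usage, 0xc))   /* pretend: TRANSFER_SRC | SAMPLED */
      gralloc |= 0x2;           /* pretend: GRALLOC_USAGE_HW_TEXTURE */
   return usage == 0 ? gralloc : -1;
}
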
*/ +#define HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL 0x100 + +enum { + /* Usage bit equal to GRALLOC_USAGE_HW_CAMERA_MASK */ + BUFFER_USAGE_CAMERA_MASK = 0x00060000U, +}; + +inline VkFormat +vk_format_from_android(unsigned android_format, unsigned android_usage) +{ + switch (android_format) { + case AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM: + return VK_FORMAT_R8G8B8A8_UNORM; + case AHARDWAREBUFFER_FORMAT_R8G8B8X8_UNORM: + case AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM: + return VK_FORMAT_R8G8B8_UNORM; + case AHARDWAREBUFFER_FORMAT_R5G6B5_UNORM: + return VK_FORMAT_R5G6B5_UNORM_PACK16; + case AHARDWAREBUFFER_FORMAT_R16G16B16A16_FLOAT: + return VK_FORMAT_R16G16B16A16_SFLOAT; + case AHARDWAREBUFFER_FORMAT_R10G10B10A2_UNORM: + return VK_FORMAT_A2B10G10R10_UNORM_PACK32; + case AHARDWAREBUFFER_FORMAT_Y8Cb8Cr8_420: + case HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL: + return VK_FORMAT_G8_B8R8_2PLANE_420_UNORM; + case AHARDWAREBUFFER_FORMAT_IMPLEMENTATION_DEFINED: + if (android_usage & BUFFER_USAGE_CAMERA_MASK) + return VK_FORMAT_G8_B8R8_2PLANE_420_UNORM; + else + return VK_FORMAT_R8G8B8_UNORM; + case AHARDWAREBUFFER_FORMAT_BLOB: + default: + return VK_FORMAT_UNDEFINED; + } +} + +static inline unsigned +android_format_from_vk(unsigned vk_format) +{ + switch (vk_format) { + case VK_FORMAT_R8G8B8A8_UNORM: + return AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM; + case VK_FORMAT_R8G8B8_UNORM: + return AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM; + case VK_FORMAT_R5G6B5_UNORM_PACK16: + return AHARDWAREBUFFER_FORMAT_R5G6B5_UNORM; + case VK_FORMAT_R16G16B16A16_SFLOAT: + return AHARDWAREBUFFER_FORMAT_R16G16B16A16_FLOAT; + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + return AHARDWAREBUFFER_FORMAT_R10G10B10A2_UNORM; + case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: +#ifdef HAVE_CROS_GRALLOC + return AHARDWAREBUFFER_FORMAT_Y8Cb8Cr8_420; +#else + return HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL; +#endif + default: + return AHARDWAREBUFFER_FORMAT_BLOB; + } +} + +static VkFormatFeatureFlags +features2_to_features(VkFormatFeatureFlags2 features2) +{ + return features2 & VK_ALL_FORMAT_FEATURE_FLAG_BITS; +} + +static VkResult +get_ahw_buffer_format_properties2( + VkDevice device_h, + const struct AHardwareBuffer *buffer, + VkAndroidHardwareBufferFormatProperties2ANDROID *pProperties) +{ + ANV_FROM_HANDLE(anv_device, device, device_h); + + /* Get a description of buffer contents . */ + AHardwareBuffer_Desc desc; + AHardwareBuffer_describe(buffer, &desc); + + /* Verify description. */ + uint64_t gpu_usage = + AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE | + AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT | + AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER; + + /* "Buffer must be a valid Android hardware buffer object with at least + * one of the AHARDWAREBUFFER_USAGE_GPU_* usage flags." + */ + if (!(desc.usage & (gpu_usage))) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + /* Fill properties fields based on description. */ + VkAndroidHardwareBufferFormatProperties2ANDROID *p = pProperties; + + p->format = vk_format_from_android(desc.format, desc.usage); + + const struct anv_format *anv_format = anv_get_format(p->format); + p->externalFormat = (uint64_t) (uintptr_t) anv_format; + + /* Default to OPTIMAL tiling but set to linear in case + * of AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER usage. 
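
The two format switches above are intended to be inverses for the formats that map one to one, so a quick sanity check is to round-trip a format through both. A test-style sketch (not driver code) using the functions defined above:

#include <assert.h>

static void
demo_check_format_round_trip(void)
{
   static const VkFormat formats[] = {
      VK_FORMAT_R8G8B8A8_UNORM,
      VK_FORMAT_R5G6B5_UNORM_PACK16,
      VK_FORMAT_R16G16B16A16_SFLOAT,
      VK_FORMAT_A2B10G10R10_UNORM_PACK32,
   };
   for (unsigned i = 0; i < sizeof(formats) / sizeof(formats[0]); i++) {
      /* android_usage = 0: skip the camera special case */
      assert(vk_format_from_android(android_format_from_vk(formats[i]), 0) ==
             formats[i]);
   }
}
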
+ */ + VkImageTiling tiling = VK_IMAGE_TILING_OPTIMAL; + + if (desc.usage & AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER) + tiling = VK_IMAGE_TILING_LINEAR; + + p->formatFeatures = + anv_get_image_format_features2(device->info, p->format, anv_format, + tiling, NULL); + + /* "Images can be created with an external format even if the Android hardware + * buffer has a format which has an equivalent Vulkan format to enable + * consistent handling of images from sources that might use either category + * of format. However, all images created with an external format are subject + * to the valid usage requirements associated with external formats, even if + * the Android hardware buffer’s format has a Vulkan equivalent." + * + * "The formatFeatures member *must* include + * VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT and at least one of + * VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT or + * VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT" + */ + p->formatFeatures |= + VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT; + + /* "Implementations may not always be able to determine the color model, + * numerical range, or chroma offsets of the image contents, so the values + * in VkAndroidHardwareBufferFormatPropertiesANDROID are only suggestions. + * Applications should treat these values as sensible defaults to use in + * the absence of more reliable information obtained through some other + * means." + */ + p->samplerYcbcrConversionComponents.r = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.g = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.b = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.a = VK_COMPONENT_SWIZZLE_IDENTITY; + + p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601; + p->suggestedYcbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_FULL; + + p->suggestedXChromaOffset = VK_CHROMA_LOCATION_MIDPOINT; + p->suggestedYChromaOffset = VK_CHROMA_LOCATION_MIDPOINT; + + return VK_SUCCESS; +} + +VkResult +anv_GetAndroidHardwareBufferPropertiesANDROID( + VkDevice device_h, + const struct AHardwareBuffer *buffer, + VkAndroidHardwareBufferPropertiesANDROID *pProperties) +{ + ANV_FROM_HANDLE(anv_device, dev, device_h); + + VkAndroidHardwareBufferFormatPropertiesANDROID *format_prop = + vk_find_struct(pProperties->pNext, + ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID); + /* Fill format properties of an Android hardware buffer. 
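
vk_find_struct() used here is the stock helper for fishing an extension struct out of a Vulkan pNext chain; underneath it is just a typed linked-list walk keyed on sType. A generic sketch of that walk (hypothetical demo_ name):

#include <vulkan/vulkan.h>

/* Walk an output pNext chain and return the first structure whose sType
 * matches, or NULL.  This is essentially what the vk_find_struct() helper
 * does.
 */
static void *
demo_find_struct(void *chain, VkStructureType stype)
{
   for (VkBaseOutStructure *s = chain; s != NULL; s = s->pNext) {
      if (s->sType == stype)
         return s;
   }
   return NULL;
}
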
*/ + if (format_prop) { + VkAndroidHardwareBufferFormatProperties2ANDROID format_prop2 = { + .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID, + }; + get_ahw_buffer_format_properties2(device_h, buffer, &format_prop2); + + format_prop->format = format_prop2.format; + format_prop->externalFormat = format_prop2.externalFormat; + format_prop->formatFeatures = + features2_to_features(format_prop2.formatFeatures); + format_prop->samplerYcbcrConversionComponents = + format_prop2.samplerYcbcrConversionComponents; + format_prop->suggestedYcbcrModel = format_prop2.suggestedYcbcrModel; + format_prop->suggestedYcbcrRange = format_prop2.suggestedYcbcrRange; + format_prop->suggestedXChromaOffset = format_prop2.suggestedXChromaOffset; + format_prop->suggestedYChromaOffset = format_prop2.suggestedYChromaOffset; + } + + VkAndroidHardwareBufferFormatProperties2ANDROID *format_prop2 = + vk_find_struct(pProperties->pNext, + ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID); + if (format_prop2) + get_ahw_buffer_format_properties2(device_h, buffer, format_prop2); + + /* NOTE - We support buffers with only one handle but do not error on + * multiple handle case. Reason is that we want to support YUV formats + * where we have many logical planes but they all point to the same + * buffer, like is the case with VK_FORMAT_G8_B8R8_2PLANE_420_UNORM. + */ + const native_handle_t *handle = + AHardwareBuffer_getNativeHandle(buffer); + int dma_buf = (handle && handle->numFds) ? handle->data[0] : -1; + if (dma_buf < 0) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + /* All memory types. */ + uint32_t memory_types = (1ull << dev->physical->memory.type_count) - 1; + + pProperties->allocationSize = lseek(dma_buf, 0, SEEK_END); + pProperties->memoryTypeBits = memory_types; + + return VK_SUCCESS; +} + +VkResult +anv_GetMemoryAndroidHardwareBufferANDROID( + VkDevice device_h, + const VkMemoryGetAndroidHardwareBufferInfoANDROID *pInfo, + struct AHardwareBuffer **pBuffer) +{ + ANV_FROM_HANDLE(anv_device_memory, mem, pInfo->memory); + + /* Some quotes from Vulkan spec: + * + * "If the device memory was created by importing an Android hardware + * buffer, vkGetMemoryAndroidHardwareBufferANDROID must return that same + * Android hardware buffer object." + * + * "VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID must + * have been included in VkExportMemoryAllocateInfo::handleTypes when + * memory was created." + */ + if (mem->ahw) { + *pBuffer = mem->ahw; + /* Increase refcount. */ + AHardwareBuffer_acquire(mem->ahw); + return VK_SUCCESS; + } + + return VK_ERROR_OUT_OF_HOST_MEMORY; +} + +#endif + +/* Construct ahw usage mask from image usage bits, see + * 'AHardwareBuffer Usage Equivalence' in Vulkan spec. + */ +uint64_t +anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create, + const VkImageUsageFlags vk_usage) +{ + uint64_t ahw_usage = 0; +#if ANDROID_API_LEVEL >= 26 + if (vk_usage & VK_IMAGE_USAGE_SAMPLED_BIT) + ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE; + + if (vk_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) + ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE; + + if (vk_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) + ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT; + + if (vk_create & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) + ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_CUBE_MAP; + + if (vk_create & VK_IMAGE_CREATE_PROTECTED_BIT) + ahw_usage |= AHARDWAREBUFFER_USAGE_PROTECTED_CONTENT; + + /* No usage bits set - set at least one GPU usage. 
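
The memoryTypeBits value above is built with the usual low-bits mask trick: do the shift in a 64-bit intermediate so that a count of 32 stays well defined, then truncate. As a tiny sketch (hypothetical demo_ name):

#include <stdint.h>

/* Bitmask with the low `count` bits set; e.g. count == 3 gives 0x7.
 * Valid for count from 0 to 32 because the shift is done in 64 bits.
 */
static uint32_t
demo_low_bits_mask(unsigned count)
{
   return (uint32_t)((1ull << count) - 1);
}
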
*/ + if (ahw_usage == 0) + ahw_usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE; +#endif + return ahw_usage; +} + +/* + * Called from anv_AllocateMemory when import AHardwareBuffer. + */ +VkResult +anv_import_ahw_memory(VkDevice device_h, + struct anv_device_memory *mem, + const VkImportAndroidHardwareBufferInfoANDROID *info) +{ +#if ANDROID_API_LEVEL >= 26 + ANV_FROM_HANDLE(anv_device, device, device_h); + + /* Import from AHardwareBuffer to anv_device_memory. */ + const native_handle_t *handle = + AHardwareBuffer_getNativeHandle(info->buffer); + + /* NOTE - We support buffers with only one handle but do not error on + * multiple handle case. Reason is that we want to support YUV formats + * where we have many logical planes but they all point to the same + * buffer, like is the case with VK_FORMAT_G8_B8R8_2PLANE_420_UNORM. + */ + int dma_buf = (handle && handle->numFds) ? handle->data[0] : -1; + if (dma_buf < 0) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + VkResult result = anv_device_import_bo(device, dma_buf, 0, + 0 /* client_address */, + &mem->bo); + assert(result == VK_SUCCESS); + + /* "If the vkAllocateMemory command succeeds, the implementation must + * acquire a reference to the imported hardware buffer, which it must + * release when the device memory object is freed. If the command fails, + * the implementation must not retain a reference." + */ + AHardwareBuffer_acquire(info->buffer); + mem->ahw = info->buffer; + + return VK_SUCCESS; +#else + return VK_ERROR_EXTENSION_NOT_PRESENT; +#endif +} + +VkResult +anv_create_ahw_memory(VkDevice device_h, + struct anv_device_memory *mem, + const VkMemoryAllocateInfo *pAllocateInfo) +{ +#if ANDROID_API_LEVEL >= 26 + const VkMemoryDedicatedAllocateInfo *dedicated_info = + vk_find_struct_const(pAllocateInfo->pNext, + MEMORY_DEDICATED_ALLOCATE_INFO); + + uint32_t w = 0; + uint32_t h = 1; + uint32_t layers = 1; + uint32_t format = 0; + uint64_t usage = 0; + + /* If caller passed dedicated information. */ + if (dedicated_info && dedicated_info->image) { + ANV_FROM_HANDLE(anv_image, image, dedicated_info->image); + w = image->vk.extent.width; + h = image->vk.extent.height; + layers = image->vk.array_layers; + format = android_format_from_vk(image->vk.format); + usage = anv_ahw_usage_from_vk_usage(image->vk.create_flags, image->vk.usage); + } else if (dedicated_info && dedicated_info->buffer) { + ANV_FROM_HANDLE(anv_buffer, buffer, dedicated_info->buffer); + w = buffer->vk.size; + format = AHARDWAREBUFFER_FORMAT_BLOB; + usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | + AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN; + } else { + w = pAllocateInfo->allocationSize; + format = AHARDWAREBUFFER_FORMAT_BLOB; + usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | + AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN; + } + + struct AHardwareBuffer *ahw = NULL; + struct AHardwareBuffer_Desc desc = { + .width = w, + .height = h, + .layers = layers, + .format = format, + .usage = usage, + }; + + if (AHardwareBuffer_allocate(&desc, &ahw) != 0) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + const VkImportAndroidHardwareBufferInfoANDROID import_info = { + .buffer = ahw, + }; + VkResult result = anv_import_ahw_memory(device_h, mem, &import_info); + + /* Release a reference to avoid leak for AHB allocation. 
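
The reference handling in anv_create_ahw_memory() relies on the NDK convention that AHardwareBuffer_allocate() hands back one reference; the import path then takes its own with AHardwareBuffer_acquire(), and the allocation reference is dropped so only the device memory keeps the buffer alive. A condensed sketch of that sequence (a sketch only, error handling elided):

#include <android/hardware_buffer.h>

/* Allocate an AHardwareBuffer and hand it to a consumer that keeps exactly
 * one reference; the acquire below mirrors what the import path does.
 */
static AHardwareBuffer *
demo_alloc_and_adopt(const AHardwareBuffer_Desc *desc)
{
   AHardwareBuffer *ahb = NULL;
   if (AHardwareBuffer_allocate(desc, &ahb) != 0)
      return NULL;

   AHardwareBuffer_acquire(ahb);  /* the consumer's (import side's) reference */
   AHardwareBuffer_release(ahb);  /* drop the reference allocate() gave us */
   return ahb;                    /* refcount is 1 here */
}
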
*/ + AHardwareBuffer_release(ahw); + + return result; +#else + return VK_ERROR_EXTENSION_NOT_PRESENT; +#endif + +} + +VkResult +anv_image_init_from_gralloc(struct anv_device *device, + struct anv_image *image, + const VkImageCreateInfo *base_info, + const VkNativeBufferANDROID *gralloc_info) +{ + struct anv_bo *bo = NULL; + VkResult result; + + struct anv_image_create_info anv_info = { + .vk_info = base_info, + .isl_extra_usage_flags = ISL_SURF_USAGE_DISABLE_AUX_BIT, + }; + + if (gralloc_info->handle->numFds != 1) { + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "VkNativeBufferANDROID::handle::numFds is %d, " + "expected 1", gralloc_info->handle->numFds); + } + + /* Do not close the gralloc handle's dma_buf. The lifetime of the dma_buf + * must exceed that of the gralloc handle, and we do not own the gralloc + * handle. + */ + int dma_buf = gralloc_info->handle->data[0]; + + /* We need to set the WRITE flag on window system buffers so that GEM will + * know we're writing to them and synchronize uses on other rings (for + * example, if the display server uses the blitter ring). + * + * If this function fails and if the imported bo was resident in the cache, + * we should avoid updating the bo's flags. Therefore, we defer updating + * the flags until success is certain. + * + */ + result = anv_device_import_bo(device, dma_buf, + ANV_BO_ALLOC_IMPLICIT_SYNC | + ANV_BO_ALLOC_IMPLICIT_WRITE, + 0 /* client_address */, + &bo); + if (result != VK_SUCCESS) { + return vk_errorf(device, result, + "failed to import dma-buf from VkNativeBufferANDROID"); + } + + enum isl_tiling tiling; + result = anv_device_get_bo_tiling(device, bo, &tiling); + if (result != VK_SUCCESS) { + return vk_errorf(device, result, + "failed to get tiling from VkNativeBufferANDROID"); + } + anv_info.isl_tiling_flags = 1u << tiling; + + enum isl_format format = anv_get_isl_format(device->info, + base_info->format, + VK_IMAGE_ASPECT_COLOR_BIT, + base_info->tiling); + assert(format != ISL_FORMAT_UNSUPPORTED); + + result = anv_image_init(device, image, &anv_info); + if (result != VK_SUCCESS) + goto fail_init; + + VkMemoryRequirements2 mem_reqs = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, + }; + + anv_image_get_memory_requirements(device, image, image->vk.aspects, + &mem_reqs); + + VkDeviceSize aligned_image_size = + align_u64(mem_reqs.memoryRequirements.size, + mem_reqs.memoryRequirements.alignment); + + if (bo->size < aligned_image_size) { + result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "dma-buf from VkNativeBufferANDROID is too small for " + "VkImage: %"PRIu64"B < %"PRIu64"B", + bo->size, aligned_image_size); + goto fail_size; + } + + assert(!image->disjoint); + assert(image->n_planes == 1); + assert(image->planes[0].primary_surface.memory_range.binding == + ANV_IMAGE_MEMORY_BINDING_MAIN); + assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo == NULL); + assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.offset == 0); + image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo = bo; + image->from_gralloc = true; + + return VK_SUCCESS; + + fail_size: + anv_image_finish(image); + fail_init: + anv_device_release_bo(device, bo); + + return result; +} + +VkResult +anv_image_bind_from_gralloc(struct anv_device *device, + struct anv_image *image, + const VkNativeBufferANDROID *gralloc_info) +{ + /* Do not close the gralloc handle's dma_buf. The lifetime of the dma_buf + * must exceed that of the gralloc handle, and we do not own the gralloc + * handle. 
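
The size validation in anv_image_init_from_gralloc() above reduces to rounding the image's required size up to its required alignment and comparing against the imported BO. A tiny sketch of that check (hypothetical demo_ name; Vulkan guarantees the alignment is a power of two):

#include <stdbool.h>
#include <stdint.h>

/* True if a buffer of bo_size bytes can back an image whose memory
 * requirements are (req_size, req_alignment).
 */
static bool
demo_bo_fits_image(uint64_t bo_size, uint64_t req_size, uint64_t req_alignment)
{
   uint64_t aligned = (req_size + req_alignment - 1) & ~(req_alignment - 1);
   return bo_size >= aligned;
}
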
+ */ + int dma_buf = gralloc_info->handle->data[0]; + + /* We need to set the WRITE flag on window system buffers so that GEM will + * know we're writing to them and synchronize uses on other rings (for + * example, if the display server uses the blitter ring). + * + * If this function fails and if the imported bo was resident in the cache, + * we should avoid updating the bo's flags. Therefore, we defer updating + * the flags until success is certain. + * + */ + struct anv_bo *bo = NULL; + VkResult result = anv_device_import_bo(device, dma_buf, + ANV_BO_ALLOC_IMPLICIT_SYNC | + ANV_BO_ALLOC_IMPLICIT_WRITE, + 0 /* client_address */, + &bo); + if (result != VK_SUCCESS) { + return vk_errorf(device, result, + "failed to import dma-buf from VkNativeBufferANDROID"); + } + + uint64_t img_size = image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].memory_range.size; + if (img_size < bo->size) { + result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "dma-buf from VkNativeBufferANDROID is too small for " + "VkImage: %"PRIu64"B < %"PRIu64"B", + bo->size, img_size); + anv_device_release_bo(device, bo); + return result; + } + + assert(!image->disjoint); + assert(image->n_planes == 1); + assert(image->planes[0].primary_surface.memory_range.binding == + ANV_IMAGE_MEMORY_BINDING_MAIN); + assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo == NULL); + assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.offset == 0); + image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo = bo; + image->from_gralloc = true; + + return VK_SUCCESS; +} + +static VkResult +format_supported_with_usage(VkDevice device_h, VkFormat format, + VkImageUsageFlags imageUsage) +{ + ANV_FROM_HANDLE(anv_device, device, device_h); + VkPhysicalDevice phys_dev_h = anv_physical_device_to_handle(device->physical); + VkResult result; + + const VkPhysicalDeviceImageFormatInfo2 image_format_info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .format = format, + .type = VK_IMAGE_TYPE_2D, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = imageUsage, + }; + + VkImageFormatProperties2 image_format_props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + }; + + /* Check that requested format and usage are supported. */ + result = anv_GetPhysicalDeviceImageFormatProperties2(phys_dev_h, + &image_format_info, &image_format_props); + if (result != VK_SUCCESS) { + return vk_errorf(device, result, + "anv_GetPhysicalDeviceImageFormatProperties2 failed " + "inside %s", __func__); + } + return VK_SUCCESS; +} + + +static VkResult +setup_gralloc0_usage(struct anv_device *device, VkFormat format, + VkImageUsageFlags imageUsage, int *grallocUsage) +{ + /* WARNING: Android's libvulkan.so hardcodes the VkImageUsageFlags + * returned to applications via VkSurfaceCapabilitiesKHR::supportedUsageFlags. + * The relevant code in libvulkan/swapchain.cpp contains this fun comment: + * + * TODO(jessehall): I think these are right, but haven't thought hard + * about it. Do we need to query the driver for support of any of + * these? + * + * Any disagreement between this function and the hardcoded + * VkSurfaceCapabilitiesKHR:supportedUsageFlags causes tests + * dEQP-VK.wsi.android.swapchain.*.image_usage to fail. 
+ */ + + if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_DST_BIT | + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)) + *grallocUsage |= GRALLOC_USAGE_HW_RENDER; + + if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_STORAGE_BIT | + VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) + *grallocUsage |= GRALLOC_USAGE_HW_TEXTURE; + + /* All VkImageUsageFlags not explicitly checked here are unsupported for + * gralloc swapchains. + */ + if (imageUsage != 0) { + return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED, + "unsupported VkImageUsageFlags(0x%x) for gralloc " + "swapchain", imageUsage); + } + + /* The below formats support GRALLOC_USAGE_HW_FB (that is, display + * scanout). This short list of formats is univserally supported on Intel + * but is incomplete. The full set of supported formats is dependent on + * kernel and hardware. + * + * FINISHME: Advertise all display-supported formats. + */ + switch (format) { + case VK_FORMAT_B8G8R8A8_UNORM: + case VK_FORMAT_R5G6B5_UNORM_PACK16: + case VK_FORMAT_R8G8B8A8_UNORM: + case VK_FORMAT_R8G8B8A8_SRGB: + *grallocUsage |= GRALLOC_USAGE_HW_FB | + GRALLOC_USAGE_HW_COMPOSER | + GRALLOC_USAGE_EXTERNAL_DISP; + break; + default: + mesa_logw("%s: unsupported format=%d", __func__, format); + } + + if (*grallocUsage == 0) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + return VK_SUCCESS; +} + +#if ANDROID_API_LEVEL >= 26 +VkResult anv_GetSwapchainGrallocUsage2ANDROID( + VkDevice device_h, + VkFormat format, + VkImageUsageFlags imageUsage, + VkSwapchainImageUsageFlagsANDROID swapchainImageUsage, + uint64_t* grallocConsumerUsage, + uint64_t* grallocProducerUsage) +{ + ANV_FROM_HANDLE(anv_device, device, device_h); + VkResult result; + + *grallocConsumerUsage = 0; + *grallocProducerUsage = 0; + mesa_logd("%s: format=%d, usage=0x%x", __func__, format, imageUsage); + + result = format_supported_with_usage(device_h, format, imageUsage); + if (result != VK_SUCCESS) + return result; + + int32_t grallocUsage = 0; + result = setup_gralloc0_usage(device, format, imageUsage, &grallocUsage); + if (result != VK_SUCCESS) + return result; + + /* Setup gralloc1 usage flags from gralloc0 flags. 
*/ + + if (grallocUsage & GRALLOC_USAGE_HW_RENDER) { + *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET; + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_CLIENT_TARGET; + } + + if (grallocUsage & GRALLOC_USAGE_HW_TEXTURE) { + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_GPU_TEXTURE; + } + + if (grallocUsage & (GRALLOC_USAGE_HW_FB | + GRALLOC_USAGE_HW_COMPOSER | + GRALLOC_USAGE_EXTERNAL_DISP)) { + *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET; + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_HWCOMPOSER; + } + + return VK_SUCCESS; +} +#endif + +VkResult anv_GetSwapchainGrallocUsageANDROID( + VkDevice device_h, + VkFormat format, + VkImageUsageFlags imageUsage, + int* grallocUsage) +{ + ANV_FROM_HANDLE(anv_device, device, device_h); + VkResult result; + + *grallocUsage = 0; + mesa_logd("%s: format=%d, usage=0x%x", __func__, format, imageUsage); + + result = format_supported_with_usage(device_h, format, imageUsage); + if (result != VK_SUCCESS) + return result; + + return setup_gralloc0_usage(device, format, imageUsage, grallocUsage); +} diff --git a/src/intel/vulkan_hasvk/anv_android.h b/src/intel/vulkan_hasvk/anv_android.h new file mode 100644 index 00000000000..4490d3b2437 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_android.h @@ -0,0 +1,57 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef ANV_ANDROID_H +#define ANV_ANDROID_H + +#if defined(ANDROID) && ANDROID_API_LEVEL >= 26 +#include +#endif +#include +#include +#include + +struct anv_device_memory; +struct anv_device; +struct anv_image; + +VkResult anv_image_init_from_gralloc(struct anv_device *device, + struct anv_image *image, + const VkImageCreateInfo *base_info, + const VkNativeBufferANDROID *gralloc_info); + +VkResult anv_image_bind_from_gralloc(struct anv_device *device, + struct anv_image *image, + const VkNativeBufferANDROID *gralloc_info); + +uint64_t anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create, + const VkImageUsageFlags vk_usage); + +VkResult anv_import_ahw_memory(VkDevice device_h, + struct anv_device_memory *mem, + const VkImportAndroidHardwareBufferInfoANDROID *info); + +VkResult anv_create_ahw_memory(VkDevice device_h, + struct anv_device_memory *mem, + const VkMemoryAllocateInfo *pAllocateInfo); +#endif /* ANV_ANDROID_H */ diff --git a/src/intel/vulkan_hasvk/anv_android_stubs.c b/src/intel/vulkan_hasvk/anv_android_stubs.c new file mode 100644 index 00000000000..d5bc11949ab --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_android_stubs.c @@ -0,0 +1,63 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "anv_android.h" + +VkResult +anv_image_init_from_gralloc(struct anv_device *device, + struct anv_image *image, + const VkImageCreateInfo *base_info, + const VkNativeBufferANDROID *gralloc_info) +{ + return VK_ERROR_EXTENSION_NOT_PRESENT; +} + +VkResult anv_image_bind_from_gralloc(struct anv_device *device, + struct anv_image *image, + const VkNativeBufferANDROID *gralloc_info) +{ + return VK_ERROR_EXTENSION_NOT_PRESENT; +} + +uint64_t +anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create, + const VkImageUsageFlags vk_usage) +{ + return 0; +} + +VkResult +anv_import_ahw_memory(VkDevice device_h, + struct anv_device_memory *mem, + const VkImportAndroidHardwareBufferInfoANDROID *info) +{ + return VK_ERROR_EXTENSION_NOT_PRESENT; +} + +VkResult +anv_create_ahw_memory(VkDevice device_h, + struct anv_device_memory *mem, + const VkMemoryAllocateInfo *pAllocateInfo) +{ + return VK_ERROR_EXTENSION_NOT_PRESENT; +} diff --git a/src/intel/vulkan_hasvk/anv_batch_chain.c b/src/intel/vulkan_hasvk/anv_batch_chain.c new file mode 100644 index 00000000000..459747e0a29 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_batch_chain.c @@ -0,0 +1,2477 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include + +#include "anv_private.h" +#include "anv_measure.h" + +#include "genxml/gen8_pack.h" +#include "genxml/genX_bits.h" +#include "perf/intel_perf.h" + +#include "util/debug.h" +#include "util/perf/u_trace.h" + +/** \file anv_batch_chain.c + * + * This file contains functions related to anv_cmd_buffer as a data + * structure. This involves everything required to create and destroy + * the actual batch buffers as well as link them together and handle + * relocations and surface state. It specifically does *not* contain any + * handling of actual vkCmd calls beyond vkCmdExecuteCommands. 
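
The relocation lists in this file grow geometrically: capacity starts at a small floor and doubles until the request fits, which keeps appends amortized constant time; anv_reloc_list_grow() below is one instance of the pattern. A generic sketch of that growth policy (hypothetical demo_ name):

#include <stdlib.h>

/* Grow an array of fixed-size elements to hold at least `needed` entries,
 * doubling from a floor of 16 much like the reloc list code below.
 * Returns the (possibly moved) array, or NULL on allocation failure.
 */
static void *
demo_array_grow(void *array, size_t elem_size, size_t *capacity, size_t needed)
{
   if (needed <= *capacity)
      return array;

   size_t new_cap = *capacity ? *capacity * 2 : 16;
   while (new_cap < needed)
      new_cap *= 2;

   void *new_array = realloc(array, new_cap * elem_size);
   if (new_array == NULL)
      return NULL;

   *capacity = new_cap;
   return new_array;
}
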
+ */ + +/*-----------------------------------------------------------------------* + * Functions related to anv_reloc_list + *-----------------------------------------------------------------------*/ + +VkResult +anv_reloc_list_init(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc) +{ + memset(list, 0, sizeof(*list)); + return VK_SUCCESS; +} + +static VkResult +anv_reloc_list_init_clone(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + const struct anv_reloc_list *other_list) +{ + list->num_relocs = other_list->num_relocs; + list->array_length = other_list->array_length; + + if (list->num_relocs > 0) { + list->relocs = + vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (list->relocs == NULL) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + + list->reloc_bos = + vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (list->reloc_bos == NULL) { + vk_free(alloc, list->relocs); + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + memcpy(list->relocs, other_list->relocs, + list->array_length * sizeof(*list->relocs)); + memcpy(list->reloc_bos, other_list->reloc_bos, + list->array_length * sizeof(*list->reloc_bos)); + } else { + list->relocs = NULL; + list->reloc_bos = NULL; + } + + list->dep_words = other_list->dep_words; + + if (list->dep_words > 0) { + list->deps = + vk_alloc(alloc, list->dep_words * sizeof(BITSET_WORD), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + memcpy(list->deps, other_list->deps, + list->dep_words * sizeof(BITSET_WORD)); + } else { + list->deps = NULL; + } + + return VK_SUCCESS; +} + +void +anv_reloc_list_finish(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc) +{ + vk_free(alloc, list->relocs); + vk_free(alloc, list->reloc_bos); + vk_free(alloc, list->deps); +} + +static VkResult +anv_reloc_list_grow(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + size_t num_additional_relocs) +{ + if (list->num_relocs + num_additional_relocs <= list->array_length) + return VK_SUCCESS; + + size_t new_length = MAX2(16, list->array_length * 2); + while (new_length < list->num_relocs + num_additional_relocs) + new_length *= 2; + + struct drm_i915_gem_relocation_entry *new_relocs = + vk_realloc(alloc, list->relocs, + new_length * sizeof(*list->relocs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_relocs == NULL) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + list->relocs = new_relocs; + + struct anv_bo **new_reloc_bos = + vk_realloc(alloc, list->reloc_bos, + new_length * sizeof(*list->reloc_bos), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_reloc_bos == NULL) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + list->reloc_bos = new_reloc_bos; + + list->array_length = new_length; + + return VK_SUCCESS; +} + +static VkResult +anv_reloc_list_grow_deps(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + uint32_t min_num_words) +{ + if (min_num_words <= list->dep_words) + return VK_SUCCESS; + + uint32_t new_length = MAX2(32, list->dep_words * 2); + while (new_length < min_num_words) + new_length *= 2; + + BITSET_WORD *new_deps = + vk_realloc(alloc, list->deps, new_length * sizeof(BITSET_WORD), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_deps == NULL) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + list->deps = new_deps; + + /* Zero out the new data */ + memset(list->deps + list->dep_words, 0, + (new_length - list->dep_words) * 
sizeof(BITSET_WORD)); + list->dep_words = new_length; + + return VK_SUCCESS; +} + +#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + +VkResult +anv_reloc_list_add_bo(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + struct anv_bo *target_bo) +{ + assert(!target_bo->is_wrapper); + assert(anv_bo_is_pinned(target_bo)); + + uint32_t idx = target_bo->gem_handle; + VkResult result = anv_reloc_list_grow_deps(list, alloc, + (idx / BITSET_WORDBITS) + 1); + if (unlikely(result != VK_SUCCESS)) + return result; + + BITSET_SET(list->deps, idx); + + return VK_SUCCESS; +} + +VkResult +anv_reloc_list_add(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + uint32_t offset, struct anv_bo *target_bo, uint32_t delta, + uint64_t *address_u64_out) +{ + struct drm_i915_gem_relocation_entry *entry; + int index; + + struct anv_bo *unwrapped_target_bo = anv_bo_unwrap(target_bo); + uint64_t target_bo_offset = READ_ONCE(unwrapped_target_bo->offset); + if (address_u64_out) + *address_u64_out = target_bo_offset + delta; + + assert(unwrapped_target_bo->gem_handle > 0); + assert(unwrapped_target_bo->refcount > 0); + + if (anv_bo_is_pinned(unwrapped_target_bo)) + return anv_reloc_list_add_bo(list, alloc, unwrapped_target_bo); + + VkResult result = anv_reloc_list_grow(list, alloc, 1); + if (result != VK_SUCCESS) + return result; + + /* XXX: Can we use I915_EXEC_HANDLE_LUT? */ + index = list->num_relocs++; + list->reloc_bos[index] = target_bo; + entry = &list->relocs[index]; + entry->target_handle = -1; /* See also anv_cmd_buffer_process_relocs() */ + entry->delta = delta; + entry->offset = offset; + entry->presumed_offset = target_bo_offset; + entry->read_domains = 0; + entry->write_domain = 0; + VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry))); + + return VK_SUCCESS; +} + +static void +anv_reloc_list_clear(struct anv_reloc_list *list) +{ + list->num_relocs = 0; + if (list->dep_words > 0) + memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD)); +} + +static VkResult +anv_reloc_list_append(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + struct anv_reloc_list *other, uint32_t offset) +{ + VkResult result = anv_reloc_list_grow(list, alloc, other->num_relocs); + if (result != VK_SUCCESS) + return result; + + if (other->num_relocs > 0) { + memcpy(&list->relocs[list->num_relocs], &other->relocs[0], + other->num_relocs * sizeof(other->relocs[0])); + memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0], + other->num_relocs * sizeof(other->reloc_bos[0])); + + for (uint32_t i = 0; i < other->num_relocs; i++) + list->relocs[i + list->num_relocs].offset += offset; + + list->num_relocs += other->num_relocs; + } + + anv_reloc_list_grow_deps(list, alloc, other->dep_words); + for (uint32_t w = 0; w < other->dep_words; w++) + list->deps[w] |= other->deps[w]; + + return VK_SUCCESS; +} + +/*-----------------------------------------------------------------------* + * Functions related to anv_batch + *-----------------------------------------------------------------------*/ + +void * +anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords) +{ + if (batch->next + num_dwords * 4 > batch->end) { + VkResult result = batch->extend_cb(batch, batch->user_data); + if (result != VK_SUCCESS) { + anv_batch_set_error(batch, result); + return NULL; + } + } + + void *p = batch->next; + + batch->next += num_dwords * 4; + assert(batch->next <= batch->end); + + return p; +} + +struct anv_address +anv_batch_address(struct anv_batch *batch, void *batch_location) +{ + 
assert(batch->start <= batch_location); + + /* Allow a jump at the current location of the batch. */ + assert(batch->next >= batch_location); + + return anv_address_add(batch->start_addr, batch_location - batch->start); +} + +void +anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other) +{ + uint32_t size, offset; + + size = other->next - other->start; + assert(size % 4 == 0); + + if (batch->next + size > batch->end) { + VkResult result = batch->extend_cb(batch, batch->user_data); + if (result != VK_SUCCESS) { + anv_batch_set_error(batch, result); + return; + } + } + + assert(batch->next + size <= batch->end); + + VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size)); + memcpy(batch->next, other->start, size); + + offset = batch->next - batch->start; + VkResult result = anv_reloc_list_append(batch->relocs, batch->alloc, + other->relocs, offset); + if (result != VK_SUCCESS) { + anv_batch_set_error(batch, result); + return; + } + + batch->next += size; +} + +/*-----------------------------------------------------------------------* + * Functions related to anv_batch_bo + *-----------------------------------------------------------------------*/ + +static VkResult +anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer, + uint32_t size, + struct anv_batch_bo **bbo_out) +{ + VkResult result; + + struct anv_batch_bo *bbo = vk_zalloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo), + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (bbo == NULL) + return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, + size, &bbo->bo); + if (result != VK_SUCCESS) + goto fail_alloc; + + result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->vk.pool->alloc); + if (result != VK_SUCCESS) + goto fail_bo_alloc; + + *bbo_out = bbo; + + return VK_SUCCESS; + + fail_bo_alloc: + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); + fail_alloc: + vk_free(&cmd_buffer->vk.pool->alloc, bbo); + + return result; +} + +static VkResult +anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer, + const struct anv_batch_bo *other_bbo, + struct anv_batch_bo **bbo_out) +{ + VkResult result; + + struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo), + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (bbo == NULL) + return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, + other_bbo->bo->size, &bbo->bo); + if (result != VK_SUCCESS) + goto fail_alloc; + + result = anv_reloc_list_init_clone(&bbo->relocs, &cmd_buffer->vk.pool->alloc, + &other_bbo->relocs); + if (result != VK_SUCCESS) + goto fail_bo_alloc; + + bbo->length = other_bbo->length; + memcpy(bbo->bo->map, other_bbo->bo->map, other_bbo->length); + *bbo_out = bbo; + + return VK_SUCCESS; + + fail_bo_alloc: + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); + fail_alloc: + vk_free(&cmd_buffer->vk.pool->alloc, bbo); + + return result; +} + +static void +anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch, + size_t batch_padding) +{ + anv_batch_set_storage(batch, (struct anv_address) { .bo = bbo->bo, }, + bbo->bo->map, bbo->bo->size - batch_padding); + batch->relocs = &bbo->relocs; + anv_reloc_list_clear(&bbo->relocs); +} + +static void +anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch, + size_t batch_padding) +{ + batch->start_addr = (struct anv_address) { .bo = bbo->bo, }; + batch->start = bbo->bo->map; + batch->next = bbo->bo->map + bbo->length; + batch->end = 
bbo->bo->map + bbo->bo->size - batch_padding; + batch->relocs = &bbo->relocs; +} + +static void +anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch) +{ + assert(batch->start == bbo->bo->map); + bbo->length = batch->next - batch->start; + VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length)); +} + +static VkResult +anv_batch_bo_grow(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo *bbo, + struct anv_batch *batch, size_t additional, + size_t batch_padding) +{ + assert(batch->start == bbo->bo->map); + bbo->length = batch->next - batch->start; + + size_t new_size = bbo->bo->size; + while (new_size <= bbo->length + additional + batch_padding) + new_size *= 2; + + if (new_size == bbo->bo->size) + return VK_SUCCESS; + + struct anv_bo *new_bo; + VkResult result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, + new_size, &new_bo); + if (result != VK_SUCCESS) + return result; + + memcpy(new_bo->map, bbo->bo->map, bbo->length); + + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); + + bbo->bo = new_bo; + anv_batch_bo_continue(bbo, batch, batch_padding); + + return VK_SUCCESS; +} + +static void +anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer, + struct anv_batch_bo *prev_bbo, + struct anv_batch_bo *next_bbo, + uint32_t next_bbo_offset) +{ + const uint32_t bb_start_offset = + prev_bbo->length - GFX8_MI_BATCH_BUFFER_START_length * 4; + ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset; + + /* Make sure we're looking at a MI_BATCH_BUFFER_START */ + assert(((*bb_start >> 29) & 0x07) == 0); + assert(((*bb_start >> 23) & 0x3f) == 49); + + if (anv_use_relocations(cmd_buffer->device->physical)) { + uint32_t reloc_idx = prev_bbo->relocs.num_relocs - 1; + assert(prev_bbo->relocs.relocs[reloc_idx].offset == bb_start_offset + 4); + + prev_bbo->relocs.reloc_bos[reloc_idx] = next_bbo->bo; + prev_bbo->relocs.relocs[reloc_idx].delta = next_bbo_offset; + + /* Use a bogus presumed offset to force a relocation */ + prev_bbo->relocs.relocs[reloc_idx].presumed_offset = -1; + } else { + assert(anv_bo_is_pinned(prev_bbo->bo)); + assert(anv_bo_is_pinned(next_bbo->bo)); + + write_reloc(cmd_buffer->device, + prev_bbo->bo->map + bb_start_offset + 4, + next_bbo->bo->offset + next_bbo_offset, true); + } +} + +static void +anv_batch_bo_destroy(struct anv_batch_bo *bbo, + struct anv_cmd_buffer *cmd_buffer) +{ + anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->vk.pool->alloc); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); + vk_free(&cmd_buffer->vk.pool->alloc, bbo); +} + +static VkResult +anv_batch_bo_list_clone(const struct list_head *list, + struct anv_cmd_buffer *cmd_buffer, + struct list_head *new_list) +{ + VkResult result = VK_SUCCESS; + + list_inithead(new_list); + + struct anv_batch_bo *prev_bbo = NULL; + list_for_each_entry(struct anv_batch_bo, bbo, list, link) { + struct anv_batch_bo *new_bbo = NULL; + result = anv_batch_bo_clone(cmd_buffer, bbo, &new_bbo); + if (result != VK_SUCCESS) + break; + list_addtail(&new_bbo->link, new_list); + + if (prev_bbo) + anv_batch_bo_link(cmd_buffer, prev_bbo, new_bbo, 0); + + prev_bbo = new_bbo; + } + + if (result != VK_SUCCESS) { + list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link) { + list_del(&bbo->link); + anv_batch_bo_destroy(bbo, cmd_buffer); + } + } + + return result; +} + +/*-----------------------------------------------------------------------* + * Functions related to anv_batch_bo + *-----------------------------------------------------------------------*/ + +static 
struct anv_batch_bo * +anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer) +{ + return list_entry(cmd_buffer->batch_bos.prev, struct anv_batch_bo, link); +} + +struct anv_address +anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_state_pool *pool = anv_binding_table_pool(cmd_buffer->device); + struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states); + return (struct anv_address) { + .bo = pool->block_pool.bo, + .offset = bt_block->offset - pool->start_offset, + }; +} + +static void +emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer, + struct anv_bo *bo, uint32_t offset) +{ + /* In gfx8+ the address field grew to two dwords to accommodate 48 bit + * offsets. The high 16 bits are in the last dword, so we can use the gfx8 + * version in either case, as long as we set the instruction length in the + * header accordingly. This means that we always emit three dwords here + * and all the padding and adjustment we do in this file works for all + * gens. + */ + +#define GFX7_MI_BATCH_BUFFER_START_length 2 +#define GFX7_MI_BATCH_BUFFER_START_length_bias 2 + + const uint32_t gfx7_length = + GFX7_MI_BATCH_BUFFER_START_length - GFX7_MI_BATCH_BUFFER_START_length_bias; + const uint32_t gfx8_length = + GFX8_MI_BATCH_BUFFER_START_length - GFX8_MI_BATCH_BUFFER_START_length_bias; + + anv_batch_emit(&cmd_buffer->batch, GFX8_MI_BATCH_BUFFER_START, bbs) { + bbs.DWordLength = cmd_buffer->device->info->ver < 8 ? + gfx7_length : gfx8_length; + bbs.SecondLevelBatchBuffer = Firstlevelbatch; + bbs.AddressSpaceIndicator = ASI_PPGTT; + bbs.BatchBufferStartAddress = (struct anv_address) { bo, offset }; + } +} + +static void +cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer, + struct anv_batch_bo *bbo) +{ + struct anv_batch *batch = &cmd_buffer->batch; + struct anv_batch_bo *current_bbo = + anv_cmd_buffer_current_batch_bo(cmd_buffer); + + /* We set the end of the batch a little short so we would be sure we + * have room for the chaining command. Since we're about to emit the + * chaining command, let's set it back where it should go. 
+ */ + batch->end += GFX8_MI_BATCH_BUFFER_START_length * 4; + assert(batch->end == current_bbo->bo->map + current_bbo->bo->size); + + emit_batch_buffer_start(cmd_buffer, bbo->bo, 0); + + anv_batch_bo_finish(current_bbo, batch); +} + +static void +anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from, + struct anv_cmd_buffer *cmd_buffer_to) +{ + assert(!anv_use_relocations(cmd_buffer_from->device->physical)); + + uint32_t *bb_start = cmd_buffer_from->batch_end; + + struct anv_batch_bo *last_bbo = + list_last_entry(&cmd_buffer_from->batch_bos, struct anv_batch_bo, link); + struct anv_batch_bo *first_bbo = + list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link); + + struct GFX8_MI_BATCH_BUFFER_START gen_bb_start = { + __anv_cmd_header(GFX8_MI_BATCH_BUFFER_START), + .SecondLevelBatchBuffer = Firstlevelbatch, + .AddressSpaceIndicator = ASI_PPGTT, + .BatchBufferStartAddress = (struct anv_address) { first_bbo->bo, 0 }, + }; + struct anv_batch local_batch = { + .start = last_bbo->bo->map, + .end = last_bbo->bo->map + last_bbo->bo->size, + .relocs = &last_bbo->relocs, + .alloc = &cmd_buffer_from->vk.pool->alloc, + }; + + __anv_cmd_pack(GFX8_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start); + + last_bbo->chained = true; +} + +static void +anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer) +{ + assert(!anv_use_relocations(cmd_buffer->device->physical)); + + struct anv_batch_bo *last_bbo = + list_last_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link); + last_bbo->chained = false; + + uint32_t *batch = cmd_buffer->batch_end; + anv_pack_struct(batch, GFX8_MI_BATCH_BUFFER_END, + __anv_cmd_header(GFX8_MI_BATCH_BUFFER_END)); +} + +static VkResult +anv_cmd_buffer_chain_batch(struct anv_batch *batch, void *_data) +{ + struct anv_cmd_buffer *cmd_buffer = _data; + struct anv_batch_bo *new_bbo = NULL; + /* Cap reallocation to chunk. */ + uint32_t alloc_size = MIN2(cmd_buffer->total_batch_size, + ANV_MAX_CMD_BUFFER_BATCH_SIZE); + + VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo); + if (result != VK_SUCCESS) + return result; + + cmd_buffer->total_batch_size += alloc_size; + + struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos); + if (seen_bbo == NULL) { + anv_batch_bo_destroy(new_bbo, cmd_buffer); + return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + } + *seen_bbo = new_bbo; + + cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo); + + list_addtail(&new_bbo->link, &cmd_buffer->batch_bos); + + anv_batch_bo_start(new_bbo, batch, GFX8_MI_BATCH_BUFFER_START_length * 4); + + return VK_SUCCESS; +} + +static VkResult +anv_cmd_buffer_grow_batch(struct anv_batch *batch, void *_data) +{ + struct anv_cmd_buffer *cmd_buffer = _data; + struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer); + + anv_batch_bo_grow(cmd_buffer, bbo, &cmd_buffer->batch, 4096, + GFX8_MI_BATCH_BUFFER_START_length * 4); + + return VK_SUCCESS; +} + +/** Allocate a binding table + * + * This function allocates a binding table. This is a bit more complicated + * than one would think due to a combination of Vulkan driver design and some + * unfortunate hardware restrictions. + * + * The 3DSTATE_BINDING_TABLE_POINTERS_* packets only have a 16-bit field for + * the binding table pointer which means that all binding tables need to live + * in the bottom 64k of surface state base address. 
+ * The way the GL driver has classically dealt with this restriction is to
+ * emit all surface states on-the-fly into the batch and have a batch buffer
+ * smaller than 64k.  This isn't really an option in Vulkan for a couple of
+ * reasons:
+ *
+ * 1) In Vulkan, we have growing (or chaining) batches so surface states have
+ *    to live in their own buffer and we have to be able to re-emit
+ *    STATE_BASE_ADDRESS as needed which requires a full pipeline stall.  In
+ *    order to avoid emitting STATE_BASE_ADDRESS any more often than needed
+ *    (it's not that hard to hit 64k of just binding tables), we allocate
+ *    surface state objects up-front when VkImageView is created.  In order
+ *    for this to work, surface state objects need to be allocated from a
+ *    global buffer.
+ *
+ * 2) We tried to design the surface state system in such a way that it's
+ *    already ready for bindless texturing.  The way bindless texturing works
+ *    on our hardware is that you have a big pool of surface state objects
+ *    (with its own state base address) and the bindless handles are simply
+ *    offsets into that pool.  With the architecture we chose, we already
+ *    have that pool and it's exactly the same pool that we use for regular
+ *    surface states, so we should already be ready for bindless.
+ *
+ * 3) For render targets, we need to be able to fill out the surface states
+ *    later in vkBeginRenderPass so that we can assign clear colors
+ *    correctly.  One way to do this would be to just create the surface
+ *    state data and then repeatedly copy it into the surface state BO every
+ *    time we have to re-emit STATE_BASE_ADDRESS.  While this works, it's
+ *    rather annoying, and it is much simpler to just allocate the surface
+ *    states up-front and re-use them for the entire render pass.
+ *
+ * While none of these are technically blockers for emitting state on the fly
+ * like we do in GL, the ability to have a single surface state pool
+ * simplifies things greatly.  Unfortunately, it comes at a cost...
+ *
+ * Because of the 64k limitation of 3DSTATE_BINDING_TABLE_POINTERS_*, we can't
+ * place the binding tables just anywhere in surface state base address.
+ * Because 64k isn't a whole lot of space, we can't simply restrict the
+ * surface state buffer to 64k, so we have to be more clever.  The solution
+ * we've chosen is to have a block pool with a maximum size of 2G that starts
+ * at zero and grows in both directions.  All surface states are allocated
+ * from the top of the pool (positive offsets) and we allocate blocks (< 64k)
+ * of binding tables from the bottom of the pool (negative offsets).  Every
+ * time we allocate a new binding table block, we set surface state base
+ * address to point to the bottom of the binding table block.  This way all
+ * of the binding tables in the block are in the bottom 64k of surface state
+ * base address.  When we fill out the binding table, we add the distance
+ * between the bottom of our binding table block and zero of the block pool
+ * to the surface state offsets so that they are correct relative to our new
+ * surface state base address at the bottom of the binding table block.
+ *
+ * \see adjust_relocations_from_state_pool()
+ * \see adjust_relocations_to_state_pool()
+ *
+ * \param[in]  entries        The number of surface state entries the binding
+ *                            table should be able to hold.
+ *
+ * \param[out] state_offset   The offset from surface state base address
+ *                            where the surface states live.  This must be
+ *                            added to the surface state offset when it is
+ *                            written into the binding table entry.
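+ *
+ * As a rough illustration (the numbers here are made up): if the current
+ * binding table block was allocated at bt_block->offset == -40960, then
+ * *state_offset is set to 40960 below.  A surface state living at offset
+ * 4096 above the block pool's zero point is then written into the table as
+ * 4096 + 40960 == 45056, which is its offset from the new surface state
+ * base address at the bottom of the binding table block.  A hypothetical
+ * caller filling entry 0 (error handling omitted, surface_state standing
+ * for a state previously allocated with
+ * anv_cmd_buffer_alloc_surface_state()) might look roughly like this:
+ *
+ *    uint32_t state_offset;
+ *    struct anv_state bt =
+ *       anv_cmd_buffer_alloc_binding_table(cmd_buffer, entries, &state_offset);
+ *    uint32_t *bt_map = bt.map;
+ *    bt_map[0] = surface_state.offset + state_offset;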
+ * + * \return An anv_state representing the binding table + */ +struct anv_state +anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer, + uint32_t entries, uint32_t *state_offset) +{ + struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states); + + uint32_t bt_size = align_u32(entries * 4, 32); + + struct anv_state state = cmd_buffer->bt_next; + if (bt_size > state.alloc_size) + return (struct anv_state) { 0 }; + + state.alloc_size = bt_size; + cmd_buffer->bt_next.offset += bt_size; + cmd_buffer->bt_next.map += bt_size; + cmd_buffer->bt_next.alloc_size -= bt_size; + + if (cmd_buffer->device->info->verx10 >= 125) { + /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to change the binding + * table address independently from surface state base address. We no + * longer need any sort of offsetting. + */ + *state_offset = 0; + } else { + assert(bt_block->offset < 0); + *state_offset = -bt_block->offset; + } + + return state; +} + +struct anv_state +anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer) +{ + struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + return anv_state_stream_alloc(&cmd_buffer->surface_state_stream, + isl_dev->ss.size, isl_dev->ss.align); +} + +struct anv_state +anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer, + uint32_t size, uint32_t alignment) +{ + return anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream, + size, alignment); +} + +VkResult +anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states); + if (bt_block == NULL) { + anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + *bt_block = anv_binding_table_pool_alloc(cmd_buffer->device); + + /* The bt_next state is a rolling state (we update it as we suballocate + * from it) which is relative to the start of the binding table block. 
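+    * For example, right after the reset below bt_next.offset is 0 and
+    * bt_next.map points at the start of the new block; a subsequent
+    * anv_cmd_buffer_alloc_binding_table(cmd_buffer, 8, &state_offset) call
+    * would then consume align_u32(8 * 4, 32) == 32 bytes, leaving
+    * bt_next.offset == 32 and bt_next.alloc_size reduced by 32.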
+ */ + cmd_buffer->bt_next = *bt_block; + cmd_buffer->bt_next.offset = 0; + + return VK_SUCCESS; +} + +VkResult +anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_batch_bo *batch_bo = NULL; + VkResult result; + + list_inithead(&cmd_buffer->batch_bos); + + cmd_buffer->total_batch_size = ANV_MIN_CMD_BUFFER_BATCH_SIZE; + + result = anv_batch_bo_create(cmd_buffer, + cmd_buffer->total_batch_size, + &batch_bo); + if (result != VK_SUCCESS) + return result; + + list_addtail(&batch_bo->link, &cmd_buffer->batch_bos); + + cmd_buffer->batch.alloc = &cmd_buffer->vk.pool->alloc; + cmd_buffer->batch.user_data = cmd_buffer; + + if (cmd_buffer->device->can_chain_batches) { + cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch; + } else { + cmd_buffer->batch.extend_cb = anv_cmd_buffer_grow_batch; + } + + anv_batch_bo_start(batch_bo, &cmd_buffer->batch, + GFX8_MI_BATCH_BUFFER_START_length * 4); + + int success = u_vector_init_pow2(&cmd_buffer->seen_bbos, 8, + sizeof(struct anv_bo *)); + if (!success) + goto fail_batch_bo; + + *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo; + + success = u_vector_init(&cmd_buffer->bt_block_states, 8, + sizeof(struct anv_state)); + if (!success) + goto fail_seen_bbos; + + result = anv_reloc_list_init(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc); + if (result != VK_SUCCESS) + goto fail_bt_blocks; + cmd_buffer->last_ss_pool_center = 0; + + result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); + if (result != VK_SUCCESS) + goto fail_bt_blocks; + + return VK_SUCCESS; + + fail_bt_blocks: + u_vector_finish(&cmd_buffer->bt_block_states); + fail_seen_bbos: + u_vector_finish(&cmd_buffer->seen_bbos); + fail_batch_bo: + anv_batch_bo_destroy(batch_bo, cmd_buffer); + + return result; +} + +void +anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_state *bt_block; + u_vector_foreach(bt_block, &cmd_buffer->bt_block_states) + anv_binding_table_pool_free(cmd_buffer->device, *bt_block); + u_vector_finish(&cmd_buffer->bt_block_states); + + anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->vk.pool->alloc); + + u_vector_finish(&cmd_buffer->seen_bbos); + + /* Destroy all of the batch buffers */ + list_for_each_entry_safe(struct anv_batch_bo, bbo, + &cmd_buffer->batch_bos, link) { + list_del(&bbo->link); + anv_batch_bo_destroy(bbo, cmd_buffer); + } +} + +void +anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) +{ + /* Delete all but the first batch bo */ + assert(!list_is_empty(&cmd_buffer->batch_bos)); + while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) { + struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer); + list_del(&bbo->link); + anv_batch_bo_destroy(bbo, cmd_buffer); + } + assert(!list_is_empty(&cmd_buffer->batch_bos)); + + anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer), + &cmd_buffer->batch, + GFX8_MI_BATCH_BUFFER_START_length * 4); + + while (u_vector_length(&cmd_buffer->bt_block_states) > 1) { + struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states); + anv_binding_table_pool_free(cmd_buffer->device, *bt_block); + } + assert(u_vector_length(&cmd_buffer->bt_block_states) == 1); + cmd_buffer->bt_next = *(struct anv_state *)u_vector_head(&cmd_buffer->bt_block_states); + cmd_buffer->bt_next.offset = 0; + + anv_reloc_list_clear(&cmd_buffer->surface_relocs); + cmd_buffer->last_ss_pool_center = 0; + + /* Reset the list of seen buffers */ + cmd_buffer->seen_bbos.head = 0; + 
cmd_buffer->seen_bbos.tail = 0; + + struct anv_batch_bo *first_bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer); + + *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = first_bbo; + + + assert(!cmd_buffer->device->can_chain_batches || + first_bbo->bo->size == ANV_MIN_CMD_BUFFER_BATCH_SIZE); + cmd_buffer->total_batch_size = first_bbo->bo->size; +} + +void +anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer); + + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { + /* When we start a batch buffer, we subtract a certain amount of + * padding from the end to ensure that we always have room to emit a + * BATCH_BUFFER_START to chain to the next BO. We need to remove + * that padding before we end the batch; otherwise, we may end up + * with our BATCH_BUFFER_END in another BO. + */ + cmd_buffer->batch.end += GFX8_MI_BATCH_BUFFER_START_length * 4; + assert(cmd_buffer->batch.start == batch_bo->bo->map); + assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size); + + /* Save end instruction location to override it later. */ + cmd_buffer->batch_end = cmd_buffer->batch.next; + + /* If we can chain this command buffer to another one, leave some place + * for the jump instruction. + */ + batch_bo->chained = anv_cmd_buffer_is_chainable(cmd_buffer); + if (batch_bo->chained) + emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0); + else + anv_batch_emit(&cmd_buffer->batch, GFX8_MI_BATCH_BUFFER_END, bbe); + + /* Round batch up to an even number of dwords. */ + if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4) + anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop); + + cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY; + } else { + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + /* If this is a secondary command buffer, we need to determine the + * mode in which it will be executed with vkExecuteCommands. We + * determine this statically here so that this stays in sync with the + * actual ExecuteCommands implementation. + */ + const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start; + if (!cmd_buffer->device->can_chain_batches) { + cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT; + } else if (cmd_buffer->device->physical->use_call_secondary) { + cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN; + /* If the secondary command buffer begins & ends in the same BO and + * its length is less than the length of CS prefetch, add some NOOPs + * instructions so the last MI_BATCH_BUFFER_START is outside the CS + * prefetch. + */ + if (cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) { + const struct intel_device_info *devinfo = cmd_buffer->device->info; + /* Careful to have everything in signed integer. 
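+             * For instance (illustrative numbers only), with a 512 byte CS
+             * prefetch and a 128 byte secondary batch, the loop below emits
+             * (512 - 128) / 4 == 96 MI_NOOPs.  If batch_len were already
+             * larger than prefetch_len, the signed difference goes negative
+             * and no NOOPs are emitted; with unsigned arithmetic it would
+             * wrap around instead, which is why everything here stays signed.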
+             */
+            int32_t prefetch_len = devinfo->cs_prefetch_size;
+            int32_t batch_len =
+               cmd_buffer->batch.next - cmd_buffer->batch.start;
+
+            for (int32_t i = 0; i < (prefetch_len - batch_len); i += 4)
+               anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop);
+         }
+
+         void *jump_addr =
+            anv_batch_emitn(&cmd_buffer->batch,
+                            GFX8_MI_BATCH_BUFFER_START_length,
+                            GFX8_MI_BATCH_BUFFER_START,
+                            .AddressSpaceIndicator = ASI_PPGTT,
+                            .SecondLevelBatchBuffer = Firstlevelbatch) +
+            (GFX8_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8);
+         cmd_buffer->return_addr = anv_batch_address(&cmd_buffer->batch, jump_addr);
+
+         /* The emit above may have caused us to chain batch buffers which
+          * would mean that batch_bo is no longer valid.
+          */
+         batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
+      } else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) &&
+                 (length < ANV_MIN_CMD_BUFFER_BATCH_SIZE / 2)) {
+         /* If the secondary has exactly one batch buffer in its list *and*
+          * that batch buffer is less than half of the maximum size, we're
+          * probably better off simply copying it into our batch.
+          */
+         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_EMIT;
+      } else if (!(cmd_buffer->usage_flags &
+                   VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
+         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN;
+
+         /* In order to chain, we need this command buffer to contain an
+          * MI_BATCH_BUFFER_START which will jump back to the calling batch.
+          * It doesn't matter where it points now so long as it has a valid
+          * relocation.  We'll adjust it later as part of the chaining
+          * process.
+          *
+          * We set the end of the batch a little short so we would be sure we
+          * have room for the chaining command.  Since we're about to emit the
+          * chaining command, let's set it back where it should go.
+ */ + cmd_buffer->batch.end += GFX8_MI_BATCH_BUFFER_START_length * 4; + assert(cmd_buffer->batch.start == batch_bo->bo->map); + assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size); + + emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0); + assert(cmd_buffer->batch.start == batch_bo->bo->map); + } else { + cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN; + } + } + + anv_batch_bo_finish(batch_bo, &cmd_buffer->batch); +} + +static VkResult +anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer, + struct list_head *list) +{ + list_for_each_entry(struct anv_batch_bo, bbo, list, link) { + struct anv_batch_bo **bbo_ptr = u_vector_add(&cmd_buffer->seen_bbos); + if (bbo_ptr == NULL) + return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + + *bbo_ptr = bbo; + } + + return VK_SUCCESS; +} + +void +anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, + struct anv_cmd_buffer *secondary) +{ + anv_measure_add_secondary(primary, secondary); + switch (secondary->exec_mode) { + case ANV_CMD_BUFFER_EXEC_MODE_EMIT: + anv_batch_emit_batch(&primary->batch, &secondary->batch); + break; + case ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT: { + struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(primary); + unsigned length = secondary->batch.end - secondary->batch.start; + anv_batch_bo_grow(primary, bbo, &primary->batch, length, + GFX8_MI_BATCH_BUFFER_START_length * 4); + anv_batch_emit_batch(&primary->batch, &secondary->batch); + break; + } + case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: { + struct anv_batch_bo *first_bbo = + list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link); + struct anv_batch_bo *last_bbo = + list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link); + + emit_batch_buffer_start(primary, first_bbo->bo, 0); + + struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary); + assert(primary->batch.start == this_bbo->bo->map); + uint32_t offset = primary->batch.next - primary->batch.start; + + /* Make the tail of the secondary point back to right after the + * MI_BATCH_BUFFER_START in the primary batch. 
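+       * After the anv_batch_bo_link() call below, execution flows roughly
+       * like this (sketch):
+       *
+       *    primary:   ... MI_BATCH_BUFFER_START --> first_bbo of the secondary
+       *    secondary: ... MI_BATCH_BUFFER_START --> this_bbo of the primary,
+       *                                             at the given offset
+       *
+       * where offset is the byte offset just past the MI_BATCH_BUFFER_START
+       * emitted above, so the secondary returns to the primary right where
+       * it left off.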
+ */ + anv_batch_bo_link(primary, last_bbo, this_bbo, offset); + + anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos); + break; + } + case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: { + struct list_head copy_list; + VkResult result = anv_batch_bo_list_clone(&secondary->batch_bos, + secondary, + ©_list); + if (result != VK_SUCCESS) + return; /* FIXME */ + + anv_cmd_buffer_add_seen_bbos(primary, ©_list); + + struct anv_batch_bo *first_bbo = + list_first_entry(©_list, struct anv_batch_bo, link); + struct anv_batch_bo *last_bbo = + list_last_entry(©_list, struct anv_batch_bo, link); + + cmd_buffer_chain_to_batch_bo(primary, first_bbo); + + list_splicetail(©_list, &primary->batch_bos); + + anv_batch_bo_continue(last_bbo, &primary->batch, + GFX8_MI_BATCH_BUFFER_START_length * 4); + break; + } + case ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN: { + struct anv_batch_bo *first_bbo = + list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link); + + uint64_t *write_return_addr = + anv_batch_emitn(&primary->batch, + GFX8_MI_STORE_DATA_IMM_length + 1 /* QWord write */, + GFX8_MI_STORE_DATA_IMM, + .Address = secondary->return_addr) + + (GFX8_MI_STORE_DATA_IMM_ImmediateData_start / 8); + + emit_batch_buffer_start(primary, first_bbo->bo, 0); + + *write_return_addr = + anv_address_physical(anv_batch_address(&primary->batch, + primary->batch.next)); + + anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos); + break; + } + default: + assert(!"Invalid execution mode"); + } + + anv_reloc_list_append(&primary->surface_relocs, &primary->vk.pool->alloc, + &secondary->surface_relocs, 0); +} + +struct anv_execbuf { + struct drm_i915_gem_execbuffer2 execbuf; + + struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences; + + struct drm_i915_gem_exec_object2 * objects; + uint32_t bo_count; + struct anv_bo ** bos; + + /* Allocated length of the 'objects' and 'bos' arrays */ + uint32_t array_length; + + uint32_t syncobj_count; + uint32_t syncobj_array_length; + struct drm_i915_gem_exec_fence * syncobjs; + uint64_t * syncobj_values; + + /* List of relocations for surface states, only used with platforms not + * using softpin. + */ + void * surface_states_relocs; + + uint32_t cmd_buffer_count; + struct anv_query_pool *perf_query_pool; + + /* Indicates whether any of the command buffers have relocations. This + * doesn't not necessarily mean we'll need the kernel to process them. It + * might be that a previous execbuf has already placed things in the VMA + * and we can make i915 skip the relocations. 
+ */ + bool has_relocs; + + const VkAllocationCallbacks * alloc; + VkSystemAllocationScope alloc_scope; + + int perf_query_pass; +}; + +static void +anv_execbuf_finish(struct anv_execbuf *exec) +{ + vk_free(exec->alloc, exec->syncobjs); + vk_free(exec->alloc, exec->syncobj_values); + vk_free(exec->alloc, exec->surface_states_relocs); + vk_free(exec->alloc, exec->objects); + vk_free(exec->alloc, exec->bos); +} + +static void +anv_execbuf_add_ext(struct anv_execbuf *exec, + uint32_t ext_name, + struct i915_user_extension *ext) +{ + __u64 *iter = &exec->execbuf.cliprects_ptr; + + exec->execbuf.flags |= I915_EXEC_USE_EXTENSIONS; + + while (*iter != 0) { + iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension; + } + + ext->name = ext_name; + + *iter = (uintptr_t) ext; +} + +static VkResult +anv_execbuf_add_bo_bitset(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t dep_words, + BITSET_WORD *deps, + uint32_t extra_flags); + +static VkResult +anv_execbuf_add_bo(struct anv_device *device, + struct anv_execbuf *exec, + struct anv_bo *bo, + struct anv_reloc_list *relocs, + uint32_t extra_flags) +{ + struct drm_i915_gem_exec_object2 *obj = NULL; + + bo = anv_bo_unwrap(bo); + + if (bo->exec_obj_index < exec->bo_count && + exec->bos[bo->exec_obj_index] == bo) + obj = &exec->objects[bo->exec_obj_index]; + + if (obj == NULL) { + /* We've never seen this one before. Add it to the list and assign + * an id that we can use later. + */ + if (exec->bo_count >= exec->array_length) { + uint32_t new_len = exec->objects ? exec->array_length * 2 : 64; + + struct drm_i915_gem_exec_object2 *new_objects = + vk_alloc(exec->alloc, new_len * sizeof(*new_objects), 8, exec->alloc_scope); + if (new_objects == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct anv_bo **new_bos = + vk_alloc(exec->alloc, new_len * sizeof(*new_bos), 8, exec->alloc_scope); + if (new_bos == NULL) { + vk_free(exec->alloc, new_objects); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + if (exec->objects) { + memcpy(new_objects, exec->objects, + exec->bo_count * sizeof(*new_objects)); + memcpy(new_bos, exec->bos, + exec->bo_count * sizeof(*new_bos)); + } + + vk_free(exec->alloc, exec->objects); + vk_free(exec->alloc, exec->bos); + + exec->objects = new_objects; + exec->bos = new_bos; + exec->array_length = new_len; + } + + assert(exec->bo_count < exec->array_length); + + bo->exec_obj_index = exec->bo_count++; + obj = &exec->objects[bo->exec_obj_index]; + exec->bos[bo->exec_obj_index] = bo; + + obj->handle = bo->gem_handle; + obj->relocation_count = 0; + obj->relocs_ptr = 0; + obj->alignment = 0; + obj->offset = bo->offset; + obj->flags = bo->flags | extra_flags; + obj->rsvd1 = 0; + obj->rsvd2 = 0; + } + + if (extra_flags & EXEC_OBJECT_WRITE) { + obj->flags |= EXEC_OBJECT_WRITE; + obj->flags &= ~EXEC_OBJECT_ASYNC; + } + + if (relocs != NULL) { + assert(obj->relocation_count == 0); + + if (relocs->num_relocs > 0) { + /* This is the first time we've ever seen a list of relocations for + * this BO. Go ahead and set the relocations and then walk the list + * of relocations and add them all. 
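+          * Note that anv_execbuf_add_bo() recurses on each reloc_bos[i]
+          * below, so every BO referenced by this relocation list also ends
+          * up in the validation list; the exec_obj_index check at the top of
+          * the function keeps any BO from being added twice.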
+ */ + exec->has_relocs = true; + obj->relocation_count = relocs->num_relocs; + obj->relocs_ptr = (uintptr_t) relocs->relocs; + + for (size_t i = 0; i < relocs->num_relocs; i++) { + VkResult result; + + /* A quick sanity check on relocations */ + assert(relocs->relocs[i].offset < bo->size); + result = anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i], + NULL, extra_flags); + if (result != VK_SUCCESS) + return result; + } + } + + return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words, + relocs->deps, extra_flags); + } + + return VK_SUCCESS; +} + +/* Add BO dependencies to execbuf */ +static VkResult +anv_execbuf_add_bo_bitset(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t dep_words, + BITSET_WORD *deps, + uint32_t extra_flags) +{ + for (uint32_t w = 0; w < dep_words; w++) { + BITSET_WORD mask = deps[w]; + while (mask) { + int i = u_bit_scan(&mask); + uint32_t gem_handle = w * BITSET_WORDBITS + i; + struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); + assert(bo->refcount > 0); + VkResult result = + anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags); + if (result != VK_SUCCESS) + return result; + } + } + + return VK_SUCCESS; +} + +static void +anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer, + struct anv_reloc_list *list) +{ + for (size_t i = 0; i < list->num_relocs; i++) { + list->relocs[i].target_handle = + anv_bo_unwrap(list->reloc_bos[i])->exec_obj_index; + } +} + +static void +adjust_relocations_from_state_pool(struct anv_state_pool *pool, + struct anv_reloc_list *relocs, + uint32_t last_pool_center_bo_offset) +{ + assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset); + uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset; + + for (size_t i = 0; i < relocs->num_relocs; i++) { + /* All of the relocations from this block pool to other BO's should + * have been emitted relative to the surface block pool center. We + * need to add the center offset to make them relative to the + * beginning of the actual GEM bo. + */ + relocs->relocs[i].offset += delta; + } +} + +static void +adjust_relocations_to_state_pool(struct anv_state_pool *pool, + struct anv_bo *from_bo, + struct anv_reloc_list *relocs, + uint32_t last_pool_center_bo_offset) +{ + assert(!from_bo->is_wrapper); + assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset); + uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset; + + /* When we initially emit relocations into a block pool, we don't + * actually know what the final center_bo_offset will be so we just emit + * it as if center_bo_offset == 0. Now that we know what the center + * offset is, we need to walk the list of relocations and adjust any + * relocations that point to the pool bo with the correct offset. + */ + for (size_t i = 0; i < relocs->num_relocs; i++) { + if (relocs->reloc_bos[i] == pool->block_pool.bo) { + /* Adjust the delta value in the relocation to correctly + * correspond to the new delta. Initially, this value may have + * been negative (if treated as unsigned), but we trust in + * uint32_t roll-over to fix that for us at this point. + */ + relocs->relocs[i].delta += delta; + + /* Since the delta has changed, we need to update the actual + * relocated value with the new presumed value. This function + * should only be called on batch buffers, so we know it isn't in + * use by the GPU at the moment. 
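+          * As a made-up example: if the pool's center moved from an assumed
+          * 0 to 65536, a relocation that was emitted with delta == -4096 (an
+          * address below the pool's center, such as the bottom of a binding
+          * table block) now carries delta == -4096 + 65536 == 61440, and the
+          * write_reloc() below rewrites the batch with presumed_offset +
+          * 61440 so the batch contents match the new delta.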
+ */ + assert(relocs->relocs[i].offset < from_bo->size); + write_reloc(pool->block_pool.device, + from_bo->map + relocs->relocs[i].offset, + relocs->relocs[i].presumed_offset + + relocs->relocs[i].delta, false); + } + } +} + +static void +anv_reloc_list_apply(struct anv_device *device, + struct anv_reloc_list *list, + struct anv_bo *bo, + bool always_relocate) +{ + bo = anv_bo_unwrap(bo); + + for (size_t i = 0; i < list->num_relocs; i++) { + struct anv_bo *target_bo = anv_bo_unwrap(list->reloc_bos[i]); + if (list->relocs[i].presumed_offset == target_bo->offset && + !always_relocate) + continue; + + void *p = bo->map + list->relocs[i].offset; + write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true); + list->relocs[i].presumed_offset = target_bo->offset; + } +} + +/** + * This function applies the relocation for a command buffer and writes the + * actual addresses into the buffers as per what we were told by the kernel on + * the previous execbuf2 call. This should be safe to do because, for each + * relocated address, we have two cases: + * + * 1) The target BO is inactive (as seen by the kernel). In this case, it is + * not in use by the GPU so updating the address is 100% ok. It won't be + * in-use by the GPU (from our context) again until the next execbuf2 + * happens. If the kernel decides to move it in the next execbuf2, it + * will have to do the relocations itself, but that's ok because it should + * have all of the information needed to do so. + * + * 2) The target BO is active (as seen by the kernel). In this case, it + * hasn't moved since the last execbuffer2 call because GTT shuffling + * *only* happens when the BO is idle. (From our perspective, it only + * happens inside the execbuffer2 ioctl, but the shuffling may be + * triggered by another ioctl, with full-ppgtt this is limited to only + * execbuffer2 ioctls on the same context, or memory pressure.) Since the + * target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT + * address and the relocated value we are writing into the BO will be the + * same as the value that is already there. + * + * There is also a possibility that the target BO is active but the exact + * RENDER_SURFACE_STATE object we are writing the relocation into isn't in + * use. In this case, the address currently in the RENDER_SURFACE_STATE + * may be stale but it's still safe to write the relocation because that + * particular RENDER_SURFACE_STATE object isn't in-use by the GPU and + * won't be until the next execbuf2 call. + * + * By doing relocations on the CPU, we can tell the kernel that it doesn't + * need to bother. We want to do this because the surface state buffer is + * used by every command buffer so, if the kernel does the relocations, it + * will always be busy and the kernel will always stall. This is also + * probably the fastest mechanism for doing relocations since the kernel would + * have to make a full copy of all the relocations lists. + */ +static bool +execbuf_can_skip_relocations(struct anv_execbuf *exec) +{ + if (!exec->has_relocs) + return true; + + static int userspace_relocs = -1; + if (userspace_relocs < 0) + userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true); + if (!userspace_relocs) + return false; + + /* First, we have to check to see whether or not we can even do the + * relocation. New buffers which have never been submitted to the kernel + * don't have a valid offset so we need to let the kernel do relocations so + * that we can get offsets for them. 
On future execbuf2 calls, those + * buffers will have offsets and we will be able to skip relocating. + * Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1. + */ + for (uint32_t i = 0; i < exec->bo_count; i++) { + assert(!exec->bos[i]->is_wrapper); + if (exec->bos[i]->offset == (uint64_t)-1) + return false; + } + + return true; +} + +static void +relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer, + struct anv_execbuf *exec) +{ + /* Since surface states are shared between command buffers and we don't + * know what order they will be submitted to the kernel, we don't know + * what address is actually written in the surface state object at any + * given time. The only option is to always relocate them. + */ + struct anv_bo *surface_state_bo = + anv_bo_unwrap(cmd_buffer->device->surface_state_pool.block_pool.bo); + anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs, + surface_state_bo, + true /* always relocate surface states */); + + /* Since we own all of the batch buffers, we know what values are stored + * in the relocated addresses and only have to update them if the offsets + * have changed. + */ + struct anv_batch_bo **bbo; + u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { + anv_reloc_list_apply(cmd_buffer->device, + &(*bbo)->relocs, (*bbo)->bo, false); + } + + for (uint32_t i = 0; i < exec->bo_count; i++) + exec->objects[i].offset = exec->bos[i]->offset; +} + +static void +reset_cmd_buffer_surface_offsets(struct anv_cmd_buffer *cmd_buffer) +{ + /* In the case where we fall back to doing kernel relocations, we need to + * ensure that the relocation list is valid. All relocations on the batch + * buffers are already valid and kept up-to-date. Since surface states are + * shared between command buffers and we don't know what order they will be + * submitted to the kernel, we don't know what address is actually written + * in the surface state object at any given time. The only option is to set + * a bogus presumed offset and let the kernel relocate them. 
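+    * A presumed offset of -1 can never match a BO's actual offset, so the
+    * kernel will always process these relocations; this is the same trick
+    * anv_batch_bo_link() uses above to force a relocation.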
+ */ + for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++) + cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1; +} + +static VkResult +anv_execbuf_add_syncobj(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t syncobj, + uint32_t flags, + uint64_t timeline_value) +{ + if (exec->syncobj_count >= exec->syncobj_array_length) { + uint32_t new_len = MAX2(exec->syncobj_array_length * 2, 16); + + struct drm_i915_gem_exec_fence *new_syncobjs = + vk_alloc(exec->alloc, new_len * sizeof(*new_syncobjs), + 8, exec->alloc_scope); + if (!new_syncobjs) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + if (exec->syncobjs) + typed_memcpy(new_syncobjs, exec->syncobjs, exec->syncobj_count); + + exec->syncobjs = new_syncobjs; + + if (exec->syncobj_values) { + uint64_t *new_syncobj_values = + vk_alloc(exec->alloc, new_len * sizeof(*new_syncobj_values), + 8, exec->alloc_scope); + if (!new_syncobj_values) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + typed_memcpy(new_syncobj_values, exec->syncobj_values, + exec->syncobj_count); + + exec->syncobj_values = new_syncobj_values; + } + + exec->syncobj_array_length = new_len; + } + + if (timeline_value && !exec->syncobj_values) { + exec->syncobj_values = + vk_zalloc(exec->alloc, exec->syncobj_array_length * + sizeof(*exec->syncobj_values), + 8, exec->alloc_scope); + if (!exec->syncobj_values) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + exec->syncobjs[exec->syncobj_count] = (struct drm_i915_gem_exec_fence) { + .handle = syncobj, + .flags = flags, + }; + if (timeline_value) + exec->syncobj_values[exec->syncobj_count] = timeline_value; + + exec->syncobj_count++; + + return VK_SUCCESS; +} + +static VkResult +anv_execbuf_add_sync(struct anv_device *device, + struct anv_execbuf *execbuf, + struct vk_sync *sync, + bool is_signal, + uint64_t value) +{ + /* It's illegal to signal a timeline with value 0 because that's never + * higher than the current value. A timeline wait on value 0 is always + * trivial because 0 <= uint64_t always. + */ + if ((sync->flags & VK_SYNC_IS_TIMELINE) && value == 0) + return VK_SUCCESS; + + if (vk_sync_is_anv_bo_sync(sync)) { + struct anv_bo_sync *bo_sync = + container_of(sync, struct anv_bo_sync, sync); + + assert(is_signal == (bo_sync->state == ANV_BO_SYNC_STATE_RESET)); + + return anv_execbuf_add_bo(device, execbuf, bo_sync->bo, NULL, + is_signal ? EXEC_OBJECT_WRITE : 0); + } else if (vk_sync_type_is_drm_syncobj(sync->type)) { + struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(sync); + + if (!(sync->flags & VK_SYNC_IS_TIMELINE)) + value = 0; + + return anv_execbuf_add_syncobj(device, execbuf, syncobj->syncobj, + is_signal ? I915_EXEC_FENCE_SIGNAL : + I915_EXEC_FENCE_WAIT, + value); + } + + unreachable("Invalid sync type"); +} + +static VkResult +setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf, + struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_state_pool *ss_pool = + &cmd_buffer->device->surface_state_pool; + + adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs, + cmd_buffer->last_ss_pool_center); + VkResult result; + if (anv_use_relocations(cmd_buffer->device->physical)) { + /* Since we aren't in the softpin case, all of our STATE_BASE_ADDRESS BOs + * will get added automatically by processing relocations on the batch + * buffer. We have to add the surface state BO manually because it has + * relocations of its own that we need to be sure are processed. 
+ */ + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + ss_pool->block_pool.bo, + &cmd_buffer->surface_relocs, 0); + if (result != VK_SUCCESS) + return result; + } else { + /* Add surface dependencies (BOs) to the execbuf */ + anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf, + cmd_buffer->surface_relocs.dep_words, + cmd_buffer->surface_relocs.deps, 0); + } + + /* First, we walk over all of the bos we've seen and add them and their + * relocations to the validate list. + */ + struct anv_batch_bo **bbo; + u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { + adjust_relocations_to_state_pool(ss_pool, (*bbo)->bo, &(*bbo)->relocs, + cmd_buffer->last_ss_pool_center); + + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + (*bbo)->bo, &(*bbo)->relocs, 0); + if (result != VK_SUCCESS) + return result; + } + + /* Now that we've adjusted all of the surface state relocations, we need to + * record the surface state pool center so future executions of the command + * buffer can adjust correctly. + */ + cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset; + + return VK_SUCCESS; +} + +static void +chain_command_buffers(struct anv_cmd_buffer **cmd_buffers, + uint32_t num_cmd_buffers) +{ + if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) { + assert(num_cmd_buffers == 1); + return; + } + + /* Chain the N-1 first batch buffers */ + for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++) + anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]); + + /* Put an end to the last one */ + anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]); +} + +static VkResult +setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf, + struct anv_queue *queue, + struct anv_cmd_buffer **cmd_buffers, + uint32_t num_cmd_buffers) +{ + struct anv_device *device = queue->device; + struct anv_state_pool *ss_pool = &device->surface_state_pool; + VkResult result; + + /* Edit the tail of the command buffers to chain them all together if they + * can be. + */ + chain_command_buffers(cmd_buffers, num_cmd_buffers); + + for (uint32_t i = 0; i < num_cmd_buffers; i++) { + anv_measure_submit(cmd_buffers[i]); + result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]); + if (result != VK_SUCCESS) + return result; + } + + /* Add all the global BOs to the object list for softpin case. */ + if (!anv_use_relocations(device->physical)) { + anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) { + result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + struct anv_block_pool *pool; + pool = &device->dynamic_state_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + pool = &device->general_state_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + pool = &device->instruction_state_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + pool = &device->binding_table_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + /* Add the BOs for all user allocated memory objects because we can't + * track after binding updates of VK_EXT_descriptor_indexing. 
+ */ + list_for_each_entry(struct anv_device_memory, mem, + &device->memory_objects, link) { + result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + } else { + /* We do not support chaining primary command buffers without + * softpin. + */ + assert(num_cmd_buffers == 1); + } + + bool no_reloc = true; + if (execbuf->has_relocs) { + no_reloc = execbuf_can_skip_relocations(execbuf); + if (no_reloc) { + /* If we were able to successfully relocate everything, tell the + * kernel that it can skip doing relocations. The requirement for + * using NO_RELOC is: + * + * 1) The addresses written in the objects must match the + * corresponding reloc.presumed_offset which in turn must match + * the corresponding execobject.offset. + * + * 2) To avoid stalling, execobject.offset should match the current + * address of that object within the active context. + * + * In order to satisfy all of the invariants that make userspace + * relocations to be safe (see relocate_cmd_buffer()), we need to + * further ensure that the addresses we use match those used by the + * kernel for the most recent execbuf2. + * + * The kernel may still choose to do relocations anyway if something + * has moved in the GTT. In this case, the relocation list still + * needs to be valid. All relocations on the batch buffers are + * already valid and kept up-to-date. For surface state relocations, + * by applying the relocations in relocate_cmd_buffer, we ensured + * that the address in the RENDER_SURFACE_STATE matches + * presumed_offset, so it should be safe for the kernel to relocate + * them as needed. + */ + for (uint32_t i = 0; i < num_cmd_buffers; i++) { + relocate_cmd_buffer(cmd_buffers[i], execbuf); + + anv_reloc_list_apply(device, &cmd_buffers[i]->surface_relocs, + device->surface_state_pool.block_pool.bo, + true /* always relocate surface states */); + } + } else { + /* In the case where we fall back to doing kernel relocations, we + * need to ensure that the relocation list is valid. All relocations + * on the batch buffers are already valid and kept up-to-date. Since + * surface states are shared between command buffers and we don't + * know what order they will be submitted to the kernel, we don't + * know what address is actually written in the surface state object + * at any given time. The only option is to set a bogus presumed + * offset and let the kernel relocate them. + */ + for (uint32_t i = 0; i < num_cmd_buffers; i++) + reset_cmd_buffer_surface_offsets(cmd_buffers[i]); + } + } + + struct anv_batch_bo *first_batch_bo = + list_first_entry(&cmd_buffers[0]->batch_bos, struct anv_batch_bo, link); + + /* The kernel requires that the last entry in the validation list be the + * batch buffer to execute. We can simply swap the element + * corresponding to the first batch_bo in the chain with the last + * element in the list. 
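[Editor's note] The comment above describes the i915 requirement that the batch buffer be the final entry of the execbuffer object list; the code that follows satisfies it with a swap that also patches the cached exec_obj_index on the BOs it moves. The same move, reduced to a standalone sketch over plain arrays (hypothetical types, not the driver's own):

#include <stdint.h>

struct fake_bo {
   uint32_t exec_obj_index;   /* where this BO currently sits in objects[] */
};

struct fake_exec_object {
   struct fake_bo *bo;        /* stands in for a drm_i915_gem_exec_object2 entry */
};

/* Move the object belonging to `batch` to the last slot of objects[],
 * keeping every BO's cached index consistent with its new position.
 */
static void
move_batch_last(struct fake_exec_object *objects, uint32_t count,
                struct fake_bo *batch)
{
   uint32_t idx = batch->exec_obj_index;
   uint32_t last = count - 1;

   if (idx == last)
      return;

   struct fake_exec_object tmp = objects[idx];

   objects[idx] = objects[last];
   objects[idx].bo->exec_obj_index = idx;

   objects[last] = tmp;
   objects[last].bo->exec_obj_index = last;
}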
+ */ + if (first_batch_bo->bo->exec_obj_index != execbuf->bo_count - 1) { + uint32_t idx = first_batch_bo->bo->exec_obj_index; + uint32_t last_idx = execbuf->bo_count - 1; + + struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx]; + assert(execbuf->bos[idx] == first_batch_bo->bo); + + execbuf->objects[idx] = execbuf->objects[last_idx]; + execbuf->bos[idx] = execbuf->bos[last_idx]; + execbuf->bos[idx]->exec_obj_index = idx; + + execbuf->objects[last_idx] = tmp_obj; + execbuf->bos[last_idx] = first_batch_bo->bo; + first_batch_bo->bo->exec_obj_index = last_idx; + } + + /* If we are pinning our BOs, we shouldn't have to relocate anything */ + if (!anv_use_relocations(device->physical)) + assert(!execbuf->has_relocs); + + /* Now we go through and fixup all of the relocation lists to point to the + * correct indices in the object array (I915_EXEC_HANDLE_LUT). We have to + * do this after we reorder the list above as some of the indices may have + * changed. + */ + struct anv_batch_bo **bbo; + if (execbuf->has_relocs) { + assert(num_cmd_buffers == 1); + u_vector_foreach(bbo, &cmd_buffers[0]->seen_bbos) + anv_cmd_buffer_process_relocs(cmd_buffers[0], &(*bbo)->relocs); + + anv_cmd_buffer_process_relocs(cmd_buffers[0], &cmd_buffers[0]->surface_relocs); + } + + if (device->physical->memory.need_clflush) { + __builtin_ia32_mfence(); + for (uint32_t i = 0; i < num_cmd_buffers; i++) { + u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) { + for (uint32_t l = 0; l < (*bbo)->length; l += CACHELINE_SIZE) + __builtin_ia32_clflush((*bbo)->bo->map + l); + } + } + } + + struct anv_batch *batch = &cmd_buffers[0]->batch; + execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf->objects, + .buffer_count = execbuf->bo_count, + .batch_start_offset = 0, + /* On platforms that cannot chain batch buffers because of the i915 + * command parser, we have to provide the batch length. Everywhere else + * we'll chain batches so no point in passing a length. + */ + .batch_len = device->can_chain_batches ? 0 : batch->next - batch->start, + .cliprects_ptr = 0, + .num_cliprects = 0, + .DR1 = 0, + .DR4 = 0, + .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | (no_reloc ? 
I915_EXEC_NO_RELOC : 0), + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + + return VK_SUCCESS; +} + +static VkResult +setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue) +{ + struct anv_device *device = queue->device; + VkResult result = anv_execbuf_add_bo(device, execbuf, + device->trivial_batch_bo, + NULL, 0); + if (result != VK_SUCCESS) + return result; + + execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf->objects, + .buffer_count = execbuf->bo_count, + .batch_start_offset = 0, + .batch_len = 8, /* GFX7_MI_BATCH_BUFFER_END and NOOP */ + .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC, + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + + return VK_SUCCESS; +} + +static VkResult +setup_utrace_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue, + struct anv_utrace_flush_copy *flush) +{ + struct anv_device *device = queue->device; + VkResult result = anv_execbuf_add_bo(device, execbuf, + flush->batch_bo, + &flush->relocs, 0); + if (result != VK_SUCCESS) + return result; + + result = anv_execbuf_add_sync(device, execbuf, flush->sync, + true /* is_signal */, 0 /* value */); + if (result != VK_SUCCESS) + return result; + + if (flush->batch_bo->exec_obj_index != execbuf->bo_count - 1) { + uint32_t idx = flush->batch_bo->exec_obj_index; + uint32_t last_idx = execbuf->bo_count - 1; + + struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx]; + assert(execbuf->bos[idx] == flush->batch_bo); + + execbuf->objects[idx] = execbuf->objects[last_idx]; + execbuf->bos[idx] = execbuf->bos[last_idx]; + execbuf->bos[idx]->exec_obj_index = idx; + + execbuf->objects[last_idx] = tmp_obj; + execbuf->bos[last_idx] = flush->batch_bo; + flush->batch_bo->exec_obj_index = last_idx; + } + + if (device->physical->memory.need_clflush) + intel_flush_range(flush->batch_bo->map, flush->batch_bo->size); + + execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf->objects, + .buffer_count = execbuf->bo_count, + .batch_start_offset = 0, + .batch_len = flush->batch.next - flush->batch.start, + .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_FENCE_ARRAY | queue->exec_flags | + (execbuf->has_relocs ? 0 : I915_EXEC_NO_RELOC), + .rsvd1 = device->context_id, + .rsvd2 = 0, + .num_cliprects = execbuf->syncobj_count, + .cliprects_ptr = (uintptr_t)execbuf->syncobjs, + }; + + return VK_SUCCESS; +} + +static VkResult +anv_queue_exec_utrace_locked(struct anv_queue *queue, + struct anv_utrace_flush_copy *flush) +{ + assert(flush->batch_bo); + + struct anv_device *device = queue->device; + struct anv_execbuf execbuf = { + .alloc = &device->vk.alloc, + .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, + }; + + VkResult result = setup_utrace_execbuf(&execbuf, queue, flush); + if (result != VK_SUCCESS) + goto error; + + int ret = queue->device->info->no_hw ? 
0 : + anv_gem_execbuffer(queue->device, &execbuf.execbuf); + if (ret) + result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m"); + + struct drm_i915_gem_exec_object2 *objects = execbuf.objects; + for (uint32_t k = 0; k < execbuf.bo_count; k++) { + if (anv_bo_is_pinned(execbuf.bos[k])) + assert(execbuf.bos[k]->offset == objects[k].offset); + execbuf.bos[k]->offset = objects[k].offset; + } + + error: + anv_execbuf_finish(&execbuf); + + return result; +} + +/* We lock around execbuf for three main reasons: + * + * 1) When a block pool is resized, we create a new gem handle with a + * different size and, in the case of surface states, possibly a different + * center offset but we re-use the same anv_bo struct when we do so. If + * this happens in the middle of setting up an execbuf, we could end up + * with our list of BOs out of sync with our list of gem handles. + * + * 2) The algorithm we use for building the list of unique buffers isn't + * thread-safe. While the client is supposed to synchronize around + * QueueSubmit, this would be extremely difficult to debug if it ever came + * up in the wild due to a broken app. It's better to play it safe and + * just lock around QueueSubmit. + * + * 3) The anv_cmd_buffer_execbuf function may perform relocations in + * userspace. Due to the fact that the surface state buffer is shared + * between batches, we can't afford to have that happen from multiple + * threads at the same time. Even though the user is supposed to ensure + * this doesn't happen, we play it safe as in (2) above. + * + * Since the only other things that ever take the device lock such as block + * pool resize only rarely happen, this will almost never be contended so + * taking a lock isn't really an expensive operation in this case. + */ +static VkResult +anv_queue_exec_locked(struct anv_queue *queue, + uint32_t wait_count, + const struct vk_sync_wait *waits, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + uint32_t signal_count, + const struct vk_sync_signal *signals, + struct anv_query_pool *perf_query_pool, + uint32_t perf_query_pass) +{ + struct anv_device *device = queue->device; + struct anv_utrace_flush_copy *utrace_flush_data = NULL; + struct anv_execbuf execbuf = { + .alloc = &queue->device->vk.alloc, + .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, + .perf_query_pass = perf_query_pass, + }; + + /* Flush the trace points first, they need to be moved */ + VkResult result = + anv_device_utrace_flush_cmd_buffers(queue, + cmd_buffer_count, + cmd_buffers, + &utrace_flush_data); + if (result != VK_SUCCESS) + goto error; + + if (utrace_flush_data && !utrace_flush_data->batch_bo) { + result = anv_execbuf_add_sync(device, &execbuf, + utrace_flush_data->sync, + true /* is_signal */, + 0); + if (result != VK_SUCCESS) + goto error; + + utrace_flush_data = NULL; + } + + /* Always add the workaround BO as it includes a driver identifier for the + * error_state. 
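[Editor's note] Both execbuf paths in this hunk end by copying the kernel-assigned offsets out of the exec object array back into the driver's BO structs; keeping those presumed offsets accurate is what allows later submissions to pass I915_EXEC_NO_RELOC. A hedged sketch of that write-back, assuming the kernel's i915 uAPI header and a hypothetical toy_bo stand-in for anv_bo:

#include <stdint.h>
#include <drm/i915_drm.h>

struct toy_bo {
   uint64_t offset;   /* hypothetical stand-in for the driver's BO offset */
};

/* After DRM_IOCTL_I915_GEM_EXECBUFFER2 returns, objects[i].offset holds the
 * GPU address the kernel actually used for each buffer; record it so the
 * next submission's presumed offsets match reality.
 */
static void
update_presumed_offsets(struct toy_bo **bos,
                        const struct drm_i915_gem_exec_object2 *objects,
                        uint32_t count)
{
   for (uint32_t i = 0; i < count; i++)
      bos[i]->offset = objects[i].offset;
}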
+ */ + result = + anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0); + if (result != VK_SUCCESS) + goto error; + + for (uint32_t i = 0; i < wait_count; i++) { + result = anv_execbuf_add_sync(device, &execbuf, + waits[i].sync, + false /* is_signal */, + waits[i].wait_value); + if (result != VK_SUCCESS) + goto error; + } + + for (uint32_t i = 0; i < signal_count; i++) { + result = anv_execbuf_add_sync(device, &execbuf, + signals[i].sync, + true /* is_signal */, + signals[i].signal_value); + if (result != VK_SUCCESS) + goto error; + } + + if (queue->sync) { + result = anv_execbuf_add_sync(device, &execbuf, + queue->sync, + true /* is_signal */, + 0 /* signal_value */); + if (result != VK_SUCCESS) + goto error; + } + + if (cmd_buffer_count) { + result = setup_execbuf_for_cmd_buffers(&execbuf, queue, + cmd_buffers, + cmd_buffer_count); + } else { + result = setup_empty_execbuf(&execbuf, queue); + } + + if (result != VK_SUCCESS) + goto error; + + const bool has_perf_query = + perf_query_pool && perf_query_pass >= 0 && cmd_buffer_count; + + if (INTEL_DEBUG(DEBUG_SUBMIT)) { + fprintf(stderr, "Batch offset=0x%x len=0x%x on queue 0\n", + execbuf.execbuf.batch_start_offset, execbuf.execbuf.batch_len); + for (uint32_t i = 0; i < execbuf.bo_count; i++) { + const struct anv_bo *bo = execbuf.bos[i]; + + fprintf(stderr, " BO: addr=0x%016"PRIx64"-0x%016"PRIx64" size=0x%010"PRIx64 + " handle=%05u name=%s\n", + bo->offset, bo->offset + bo->size - 1, bo->size, bo->gem_handle, bo->name); + } + } + + if (INTEL_DEBUG(DEBUG_BATCH)) { + fprintf(stderr, "Batch on queue %d\n", (int)(queue - device->queues)); + if (cmd_buffer_count) { + if (has_perf_query) { + struct anv_bo *pass_batch_bo = perf_query_pool->bo; + uint64_t pass_batch_offset = + khr_perf_query_preamble_offset(perf_query_pool, perf_query_pass); + + intel_print_batch(&device->decoder_ctx, + pass_batch_bo->map + pass_batch_offset, 64, + pass_batch_bo->offset + pass_batch_offset, false); + } + + for (uint32_t i = 0; i < cmd_buffer_count; i++) { + struct anv_batch_bo **bo = + u_vector_tail(&cmd_buffers[i]->seen_bbos); + device->cmd_buffer_being_decoded = cmd_buffers[i]; + intel_print_batch(&device->decoder_ctx, (*bo)->bo->map, + (*bo)->bo->size, (*bo)->bo->offset, false); + device->cmd_buffer_being_decoded = NULL; + } + } else { + intel_print_batch(&device->decoder_ctx, + device->trivial_batch_bo->map, + device->trivial_batch_bo->size, + device->trivial_batch_bo->offset, false); + } + } + + if (execbuf.syncobj_values) { + execbuf.timeline_fences.fence_count = execbuf.syncobj_count; + execbuf.timeline_fences.handles_ptr = (uintptr_t)execbuf.syncobjs; + execbuf.timeline_fences.values_ptr = (uintptr_t)execbuf.syncobj_values; + anv_execbuf_add_ext(&execbuf, + DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES, + &execbuf.timeline_fences.base); + } else if (execbuf.syncobjs) { + execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY; + execbuf.execbuf.num_cliprects = execbuf.syncobj_count; + execbuf.execbuf.cliprects_ptr = (uintptr_t)execbuf.syncobjs; + } + + if (has_perf_query) { + assert(perf_query_pass < perf_query_pool->n_passes); + struct intel_perf_query_info *query_info = + perf_query_pool->pass_query[perf_query_pass]; + + /* Some performance queries just the pipeline statistic HW, no need for + * OA in that case, so no need to reconfigure. 
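[Editor's note] A few lines above, the syncobjs collected earlier are handed to the kernel either through the timeline-fences execbuffer extension (when any timeline values are present) or through the legacy I915_EXEC_FENCE_ARRAY path, which reuses the cliprects fields of drm_i915_gem_execbuffer2. A hedged sketch of that uAPI arrangement; attach_fences is a made-up helper and the exact extension chaining is assumed from the i915 uAPI header, error handling omitted:

#include <stdint.h>
#include <drm/i915_drm.h>

static void
attach_fences(struct drm_i915_gem_execbuffer2 *eb,
              struct drm_i915_gem_execbuffer_ext_timeline_fences *ext,
              struct drm_i915_gem_exec_fence *fences,
              uint64_t *values, uint32_t count)
{
   if (values) {
      /* Timeline path: chain an extension carrying per-fence values. */
      ext->base.name = DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES;
      ext->fence_count = count;
      ext->handles_ptr = (uintptr_t)fences;
      ext->values_ptr = (uintptr_t)values;
      eb->flags |= I915_EXEC_USE_EXTENSIONS;
      eb->cliprects_ptr = (uintptr_t)ext;
   } else {
      /* Legacy path: the fence array rides in the (unused) cliprects fields. */
      eb->flags |= I915_EXEC_FENCE_ARRAY;
      eb->num_cliprects = count;
      eb->cliprects_ptr = (uintptr_t)fences;
   }
}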
+ */ + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG) && + (query_info->kind == INTEL_PERF_QUERY_TYPE_OA || + query_info->kind == INTEL_PERF_QUERY_TYPE_RAW)) { + int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG, + (void *)(uintptr_t) query_info->oa_metrics_set_id); + if (ret < 0) { + result = vk_device_set_lost(&device->vk, + "i915-perf config failed: %s", + strerror(errno)); + } + } + + struct anv_bo *pass_batch_bo = perf_query_pool->bo; + + struct drm_i915_gem_exec_object2 query_pass_object = { + .handle = pass_batch_bo->gem_handle, + .offset = pass_batch_bo->offset, + .flags = pass_batch_bo->flags, + }; + struct drm_i915_gem_execbuffer2 query_pass_execbuf = { + .buffers_ptr = (uintptr_t) &query_pass_object, + .buffer_count = 1, + .batch_start_offset = khr_perf_query_preamble_offset(perf_query_pool, + perf_query_pass), + .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags, + .rsvd1 = device->context_id, + }; + + int ret = queue->device->info->no_hw ? 0 : + anv_gem_execbuffer(queue->device, &query_pass_execbuf); + if (ret) + result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m"); + } + + int ret = queue->device->info->no_hw ? 0 : + anv_gem_execbuffer(queue->device, &execbuf.execbuf); + if (ret) + result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m"); + + if (result == VK_SUCCESS && queue->sync) { + result = vk_sync_wait(&device->vk, queue->sync, 0, + VK_SYNC_WAIT_COMPLETE, UINT64_MAX); + if (result != VK_SUCCESS) + result = vk_queue_set_lost(&queue->vk, "sync wait failed"); + } + + struct drm_i915_gem_exec_object2 *objects = execbuf.objects; + for (uint32_t k = 0; k < execbuf.bo_count; k++) { + if (anv_bo_is_pinned(execbuf.bos[k])) + assert(execbuf.bos[k]->offset == objects[k].offset); + execbuf.bos[k]->offset = objects[k].offset; + } + + error: + anv_execbuf_finish(&execbuf); + + if (result == VK_SUCCESS && utrace_flush_data) + result = anv_queue_exec_utrace_locked(queue, utrace_flush_data); + + return result; +} + +static inline bool +can_chain_query_pools(struct anv_query_pool *p1, struct anv_query_pool *p2) +{ + return (!p1 || !p2 || p1 == p2); +} + +static VkResult +anv_queue_submit_locked(struct anv_queue *queue, + struct vk_queue_submit *submit) +{ + VkResult result; + + if (submit->command_buffer_count == 0) { + result = anv_queue_exec_locked(queue, submit->wait_count, submit->waits, + 0 /* cmd_buffer_count */, + NULL /* cmd_buffers */, + submit->signal_count, submit->signals, + NULL /* perf_query_pool */, + 0 /* perf_query_pass */); + if (result != VK_SUCCESS) + return result; + } else { + /* Everything's easier if we don't have to bother with container_of() */ + STATIC_ASSERT(offsetof(struct anv_cmd_buffer, vk) == 0); + struct vk_command_buffer **vk_cmd_buffers = submit->command_buffers; + struct anv_cmd_buffer **cmd_buffers = (void *)vk_cmd_buffers; + uint32_t start = 0; + uint32_t end = submit->command_buffer_count; + struct anv_query_pool *perf_query_pool = + cmd_buffers[start]->perf_query_pool; + for (uint32_t n = 0; n < end; n++) { + bool can_chain = false; + uint32_t next = n + 1; + /* Can we chain the last buffer into the next one? */ + if (next < end && + anv_cmd_buffer_is_chainable(cmd_buffers[next]) && + can_chain_query_pools + (cmd_buffers[next]->perf_query_pool, perf_query_pool)) { + can_chain = true; + perf_query_pool = + perf_query_pool ? perf_query_pool : + cmd_buffers[next]->perf_query_pool; + } + if (!can_chain) { + /* The next buffer cannot be chained, or we have reached the + * last buffer, submit what have been chained so far. 
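[Editor's note] The loop this comment belongs to walks the submitted command buffers and flushes a run of chainable ones whenever chaining has to stop: an incompatible perf query pool, a non-chainable buffer, or the end of the list. Waits are only attached to the first run and signals only to the last. The control flow, stripped of the driver specifics into a small self-contained sketch (chainable() is a stand-in predicate):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Submit items[start..end) as one execbuf; waits only apply to the first
 * run of a submission and signals only to the last one.
 */
static void
flush_run(uint32_t start, uint32_t end, bool first_run, bool last_run)
{
   printf("submit %u..%u (waits: %d, signals: %d)\n",
          start, end - 1, first_run, last_run);
}

/* Hypothetical predicate: may item i be chained after item i - 1? */
static bool
chainable(const int *items, uint32_t i)
{
   return items[i] >= 0;   /* stand-in rule, just for the example */
}

static void
submit_in_runs(const int *items, uint32_t count)
{
   uint32_t start = 0;
   for (uint32_t n = 0; n < count; n++) {
      uint32_t next = n + 1;
      bool can_chain = next < count && chainable(items, next);
      if (!can_chain) {
         flush_run(start, next, start == 0, next == count);
         start = next;
      }
   }
}

int
main(void)
{
   const int items[] = { 1, 2, -1, 3, 4 };   /* item 2 cannot be chained into */
   submit_in_runs(items, 5);                 /* prints runs 0..1 and 2..4 */
   return 0;
}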
+ */ + VkResult result = + anv_queue_exec_locked(queue, + start == 0 ? submit->wait_count : 0, + start == 0 ? submit->waits : NULL, + next - start, &cmd_buffers[start], + next == end ? submit->signal_count : 0, + next == end ? submit->signals : NULL, + perf_query_pool, + submit->perf_pass_index); + if (result != VK_SUCCESS) + return result; + if (next < end) { + start = next; + perf_query_pool = cmd_buffers[start]->perf_query_pool; + } + } + } + } + for (uint32_t i = 0; i < submit->signal_count; i++) { + if (!vk_sync_is_anv_bo_sync(submit->signals[i].sync)) + continue; + + struct anv_bo_sync *bo_sync = + container_of(submit->signals[i].sync, struct anv_bo_sync, sync); + + /* Once the execbuf has returned, we need to set the fence state to + * SUBMITTED. We can't do this before calling execbuf because + * anv_GetFenceStatus does take the global device lock before checking + * fence->state. + * + * We set the fence state to SUBMITTED regardless of whether or not the + * execbuf succeeds because we need to ensure that vkWaitForFences() and + * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or + * VK_SUCCESS) in a finite amount of time even if execbuf fails. + */ + assert(bo_sync->state == ANV_BO_SYNC_STATE_RESET); + bo_sync->state = ANV_BO_SYNC_STATE_SUBMITTED; + } + + pthread_cond_broadcast(&queue->device->queue_submit); + + return VK_SUCCESS; +} + +VkResult +anv_queue_submit(struct vk_queue *vk_queue, + struct vk_queue_submit *submit) +{ + struct anv_queue *queue = container_of(vk_queue, struct anv_queue, vk); + struct anv_device *device = queue->device; + VkResult result; + + if (queue->device->info->no_hw) { + for (uint32_t i = 0; i < submit->signal_count; i++) { + result = vk_sync_signal(&device->vk, + submit->signals[i].sync, + submit->signals[i].signal_value); + if (result != VK_SUCCESS) + return vk_queue_set_lost(&queue->vk, "vk_sync_signal failed"); + } + return VK_SUCCESS; + } + + uint64_t start_ts = intel_ds_begin_submit(queue->ds); + + pthread_mutex_lock(&device->mutex); + result = anv_queue_submit_locked(queue, submit); + /* Take submission ID under lock */ + pthread_mutex_unlock(&device->mutex); + + intel_ds_end_submit(queue->ds, start_ts); + + return result; +} + +VkResult +anv_queue_submit_simple_batch(struct anv_queue *queue, + struct anv_batch *batch) +{ + struct anv_device *device = queue->device; + VkResult result = VK_SUCCESS; + int err; + + if (queue->device->info->no_hw) + return VK_SUCCESS; + + /* This is only used by device init so we can assume the queue is empty and + * we aren't fighting with a submit thread. 
+ */ + assert(vk_queue_is_empty(&queue->vk)); + + uint32_t batch_size = align_u32(batch->next - batch->start, 8); + + struct anv_bo *batch_bo = NULL; + result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size, &batch_bo); + if (result != VK_SUCCESS) + return result; + + memcpy(batch_bo->map, batch->start, batch_size); + if (device->physical->memory.need_clflush) + intel_flush_range(batch_bo->map, batch_size); + + struct anv_execbuf execbuf = { + .alloc = &queue->device->vk.alloc, + .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, + }; + + result = anv_execbuf_add_bo(device, &execbuf, batch_bo, NULL, 0); + if (result != VK_SUCCESS) + goto fail; + + if (INTEL_DEBUG(DEBUG_BATCH)) { + intel_print_batch(&device->decoder_ctx, + batch_bo->map, + batch_bo->size, + batch_bo->offset, false); + } + + execbuf.execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf.objects, + .buffer_count = execbuf.bo_count, + .batch_start_offset = 0, + .batch_len = batch_size, + .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC, + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + + err = anv_gem_execbuffer(device, &execbuf.execbuf); + if (err) { + result = vk_device_set_lost(&device->vk, "anv_gem_execbuffer failed: %m"); + goto fail; + } + + result = anv_device_wait(device, batch_bo, INT64_MAX); + if (result != VK_SUCCESS) { + result = vk_device_set_lost(&device->vk, + "anv_device_wait failed: %m"); + goto fail; + } + +fail: + anv_execbuf_finish(&execbuf); + anv_bo_pool_free(&device->batch_bo_pool, batch_bo); + + return result; +} diff --git a/src/intel/vulkan_hasvk/anv_blorp.c b/src/intel/vulkan_hasvk/anv_blorp.c new file mode 100644 index 00000000000..c829cb8aa46 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_blorp.c @@ -0,0 +1,1995 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" + +static bool +lookup_blorp_shader(struct blorp_batch *batch, + const void *key, uint32_t key_size, + uint32_t *kernel_out, void *prog_data_out) +{ + struct blorp_context *blorp = batch->blorp; + struct anv_device *device = blorp->driver_ctx; + + struct anv_shader_bin *bin = + anv_device_search_for_kernel(device, device->internal_cache, + key, key_size, NULL); + if (!bin) + return false; + + /* The cache already has a reference and it's not going anywhere so there + * is no need to hold a second reference. 
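[Editor's note] Earlier in this hunk, anv_queue_submit_simple_batch copies the batch through a CPU map and, when need_clflush is set (no LLC), flushes it with intel_flush_range before handing it to the GPU; setup_execbuf_for_cmd_buffers does the same with an explicit clflush loop. A freestanding x86-only sketch of that idea, assuming a 64-byte cacheline (intel_flush_range itself is the driver's helper and is not reproduced here):

#include <stddef.h>
#include <stdint.h>

#define CACHELINE_SIZE 64

/* Flush every cacheline touched by [start, start + size) so CPU stores are
 * visible to the GPU before it consumes the buffer.
 */
static void
flush_range_for_gpu(void *start, size_t size)
{
   __builtin_ia32_mfence();
   char *p = (char *)((uintptr_t)start & ~(uintptr_t)(CACHELINE_SIZE - 1));
   char *end = (char *)start + size;
   for (; p < end; p += CACHELINE_SIZE)
      __builtin_ia32_clflush(p);
   __builtin_ia32_mfence();
}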
+ */ + anv_shader_bin_unref(device, bin); + + *kernel_out = bin->kernel.offset; + *(const struct brw_stage_prog_data **)prog_data_out = bin->prog_data; + + return true; +} + +static bool +upload_blorp_shader(struct blorp_batch *batch, uint32_t stage, + const void *key, uint32_t key_size, + const void *kernel, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, + uint32_t *kernel_out, void *prog_data_out) +{ + struct blorp_context *blorp = batch->blorp; + struct anv_device *device = blorp->driver_ctx; + + struct anv_pipeline_bind_map bind_map = { + .surface_count = 0, + .sampler_count = 0, + }; + + struct anv_shader_bin *bin = + anv_device_upload_kernel(device, device->internal_cache, stage, + key, key_size, kernel, kernel_size, + prog_data, prog_data_size, + NULL, 0, NULL, &bind_map); + + if (!bin) + return false; + + /* The cache already has a reference and it's not going anywhere so there + * is no need to hold a second reference. + */ + anv_shader_bin_unref(device, bin); + + *kernel_out = bin->kernel.offset; + *(const struct brw_stage_prog_data **)prog_data_out = bin->prog_data; + + return true; +} + +void +anv_device_init_blorp(struct anv_device *device) +{ + const struct blorp_config config = { + .use_mesh_shading = device->physical->vk.supported_extensions.NV_mesh_shader, + }; + + blorp_init(&device->blorp, device, &device->isl_dev, &config); + device->blorp.compiler = device->physical->compiler; + device->blorp.lookup_shader = lookup_blorp_shader; + device->blorp.upload_shader = upload_blorp_shader; + switch (device->info->verx10) { + case 70: + device->blorp.exec = gfx7_blorp_exec; + break; + case 75: + device->blorp.exec = gfx75_blorp_exec; + break; + case 80: + device->blorp.exec = gfx8_blorp_exec; + break; + case 90: + device->blorp.exec = gfx9_blorp_exec; + break; + case 110: + device->blorp.exec = gfx11_blorp_exec; + break; + case 120: + device->blorp.exec = gfx12_blorp_exec; + break; + case 125: + device->blorp.exec = gfx125_blorp_exec; + break; + default: + unreachable("Unknown hardware generation"); + } +} + +void +anv_device_finish_blorp(struct anv_device *device) +{ + blorp_finish(&device->blorp); +} + +static void +anv_blorp_batch_init(struct anv_cmd_buffer *cmd_buffer, + struct blorp_batch *batch, enum blorp_batch_flags flags) +{ + if (!(cmd_buffer->queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT)) { + assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_COMPUTE_BIT); + flags |= BLORP_BATCH_USE_COMPUTE; + } + + blorp_batch_init(&cmd_buffer->device->blorp, batch, cmd_buffer, flags); +} + +static void +anv_blorp_batch_finish(struct blorp_batch *batch) +{ + blorp_batch_finish(batch); +} + +static void +get_blorp_surf_for_anv_buffer(struct anv_device *device, + struct anv_buffer *buffer, uint64_t offset, + uint32_t width, uint32_t height, + uint32_t row_pitch, enum isl_format format, + bool is_dest, + struct blorp_surf *blorp_surf, + struct isl_surf *isl_surf) +{ + bool ok UNUSED; + + *blorp_surf = (struct blorp_surf) { + .surf = isl_surf, + .addr = { + .buffer = buffer->address.bo, + .offset = buffer->address.offset + offset, + .mocs = anv_mocs(device, buffer->address.bo, + is_dest ? ISL_SURF_USAGE_RENDER_TARGET_BIT + : ISL_SURF_USAGE_TEXTURE_BIT), + }, + }; + + ok = isl_surf_init(&device->isl_dev, isl_surf, + .dim = ISL_SURF_DIM_2D, + .format = format, + .width = width, + .height = height, + .depth = 1, + .levels = 1, + .array_len = 1, + .samples = 1, + .row_pitch_B = row_pitch, + .usage = is_dest ? 
ISL_SURF_USAGE_RENDER_TARGET_BIT + : ISL_SURF_USAGE_TEXTURE_BIT, + .tiling_flags = ISL_TILING_LINEAR_BIT); + assert(ok); +} + +/* Pick something high enough that it won't be used in core and low enough it + * will never map to an extension. + */ +#define ANV_IMAGE_LAYOUT_EXPLICIT_AUX (VkImageLayout)10000000 + +static struct blorp_address +anv_to_blorp_address(struct anv_address addr) +{ + return (struct blorp_address) { + .buffer = addr.bo, + .offset = addr.offset, + }; +} + +static void +get_blorp_surf_for_anv_image(const struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlags aspect, + VkImageUsageFlags usage, + VkImageLayout layout, + enum isl_aux_usage aux_usage, + struct blorp_surf *blorp_surf) +{ + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + if (layout != ANV_IMAGE_LAYOUT_EXPLICIT_AUX) { + assert(usage != 0); + aux_usage = anv_layout_to_aux_usage(device->info, image, + aspect, usage, layout); + } + + isl_surf_usage_flags_t mocs_usage = + (usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) ? + ISL_SURF_USAGE_RENDER_TARGET_BIT : ISL_SURF_USAGE_TEXTURE_BIT; + + const struct anv_surface *surface = &image->planes[plane].primary_surface; + const struct anv_address address = + anv_image_address(image, &surface->memory_range); + + *blorp_surf = (struct blorp_surf) { + .surf = &surface->isl, + .addr = { + .buffer = address.bo, + .offset = address.offset, + .mocs = anv_mocs(device, address.bo, mocs_usage), + }, + }; + + if (aux_usage != ISL_AUX_USAGE_NONE) { + const struct anv_surface *aux_surface = &image->planes[plane].aux_surface; + const struct anv_address aux_address = + anv_image_address(image, &aux_surface->memory_range); + + blorp_surf->aux_usage = aux_usage; + blorp_surf->aux_surf = &aux_surface->isl; + + if (!anv_address_is_null(aux_address)) { + blorp_surf->aux_addr = (struct blorp_address) { + .buffer = aux_address.bo, + .offset = aux_address.offset, + .mocs = anv_mocs(device, aux_address.bo, 0), + }; + } + + /* If we're doing a partial resolve, then we need the indirect clear + * color. 
If we are doing a fast clear and want to store/update the + * clear color, we also pass the address to blorp, otherwise it will only + * stomp the CCS to a particular value and won't care about format or + * clear value + */ + if (aspect & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + const struct anv_address clear_color_addr = + anv_image_get_clear_color_addr(device, image, aspect); + blorp_surf->clear_color_addr = anv_to_blorp_address(clear_color_addr); + } else if (aspect & VK_IMAGE_ASPECT_DEPTH_BIT) { + const struct anv_address clear_color_addr = + anv_image_get_clear_color_addr(device, image, aspect); + blorp_surf->clear_color_addr = anv_to_blorp_address(clear_color_addr); + blorp_surf->clear_color = (union isl_color_value) { + .f32 = { ANV_HZ_FC_VAL }, + }; + } + } +} + +static bool +get_blorp_surf_for_anv_shadow_image(const struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlags aspect, + struct blorp_surf *blorp_surf) +{ + + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + if (!anv_surface_is_valid(&image->planes[plane].shadow_surface)) + return false; + + const struct anv_surface *surface = &image->planes[plane].shadow_surface; + const struct anv_address address = + anv_image_address(image, &surface->memory_range); + + *blorp_surf = (struct blorp_surf) { + .surf = &surface->isl, + .addr = { + .buffer = address.bo, + .offset = address.offset, + .mocs = anv_mocs(device, address.bo, ISL_SURF_USAGE_RENDER_TARGET_BIT), + }, + }; + + return true; +} + +static void +copy_image(struct anv_cmd_buffer *cmd_buffer, + struct blorp_batch *batch, + struct anv_image *src_image, + VkImageLayout src_image_layout, + struct anv_image *dst_image, + VkImageLayout dst_image_layout, + const VkImageCopy2 *region) +{ + VkOffset3D srcOffset = + vk_image_sanitize_offset(&src_image->vk, region->srcOffset); + VkOffset3D dstOffset = + vk_image_sanitize_offset(&dst_image->vk, region->dstOffset); + VkExtent3D extent = + vk_image_sanitize_extent(&src_image->vk, region->extent); + + const uint32_t dst_level = region->dstSubresource.mipLevel; + unsigned dst_base_layer, layer_count; + if (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) { + dst_base_layer = region->dstOffset.z; + layer_count = region->extent.depth; + } else { + dst_base_layer = region->dstSubresource.baseArrayLayer; + layer_count = vk_image_subresource_layer_count(&dst_image->vk, + ®ion->dstSubresource); + } + + const uint32_t src_level = region->srcSubresource.mipLevel; + unsigned src_base_layer; + if (src_image->vk.image_type == VK_IMAGE_TYPE_3D) { + src_base_layer = region->srcOffset.z; + } else { + src_base_layer = region->srcSubresource.baseArrayLayer; + assert(layer_count == + vk_image_subresource_layer_count(&src_image->vk, + ®ion->srcSubresource)); + } + + VkImageAspectFlags src_mask = region->srcSubresource.aspectMask, + dst_mask = region->dstSubresource.aspectMask; + + assert(anv_image_aspects_compatible(src_mask, dst_mask)); + + if (util_bitcount(src_mask) > 1) { + anv_foreach_image_aspect_bit(aspect_bit, src_image, src_mask) { + struct blorp_surf src_surf, dst_surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, + src_image, 1UL << aspect_bit, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + src_image_layout, ISL_AUX_USAGE_NONE, + &src_surf); + get_blorp_surf_for_anv_image(cmd_buffer->device, + dst_image, 1UL << aspect_bit, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + dst_image_layout, ISL_AUX_USAGE_NONE, + &dst_surf); + anv_cmd_buffer_mark_image_written(cmd_buffer, dst_image, + 1UL << aspect_bit, + dst_surf.aux_usage, 
dst_level, + dst_base_layer, layer_count); + + for (unsigned i = 0; i < layer_count; i++) { + blorp_copy(batch, &src_surf, src_level, src_base_layer + i, + &dst_surf, dst_level, dst_base_layer + i, + srcOffset.x, srcOffset.y, + dstOffset.x, dstOffset.y, + extent.width, extent.height); + } + + struct blorp_surf dst_shadow_surf; + if (get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, + dst_image, + 1UL << aspect_bit, + &dst_shadow_surf)) { + for (unsigned i = 0; i < layer_count; i++) { + blorp_copy(batch, &src_surf, src_level, src_base_layer + i, + &dst_shadow_surf, dst_level, dst_base_layer + i, + srcOffset.x, srcOffset.y, + dstOffset.x, dstOffset.y, + extent.width, extent.height); + } + } + } + } else { + struct blorp_surf src_surf, dst_surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, src_image, src_mask, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + src_image_layout, ISL_AUX_USAGE_NONE, + &src_surf); + get_blorp_surf_for_anv_image(cmd_buffer->device, dst_image, dst_mask, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + dst_image_layout, ISL_AUX_USAGE_NONE, + &dst_surf); + anv_cmd_buffer_mark_image_written(cmd_buffer, dst_image, dst_mask, + dst_surf.aux_usage, dst_level, + dst_base_layer, layer_count); + + for (unsigned i = 0; i < layer_count; i++) { + blorp_copy(batch, &src_surf, src_level, src_base_layer + i, + &dst_surf, dst_level, dst_base_layer + i, + srcOffset.x, srcOffset.y, + dstOffset.x, dstOffset.y, + extent.width, extent.height); + } + + struct blorp_surf dst_shadow_surf; + if (get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, + dst_image, dst_mask, + &dst_shadow_surf)) { + for (unsigned i = 0; i < layer_count; i++) { + blorp_copy(batch, &src_surf, src_level, src_base_layer + i, + &dst_shadow_surf, dst_level, dst_base_layer + i, + srcOffset.x, srcOffset.y, + dstOffset.x, dstOffset.y, + extent.width, extent.height); + } + } + } +} + +void anv_CmdCopyImage2( + VkCommandBuffer commandBuffer, + const VkCopyImageInfo2* pCopyImageInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, src_image, pCopyImageInfo->srcImage); + ANV_FROM_HANDLE(anv_image, dst_image, pCopyImageInfo->dstImage); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { + copy_image(cmd_buffer, &batch, + src_image, pCopyImageInfo->srcImageLayout, + dst_image, pCopyImageInfo->dstImageLayout, + &pCopyImageInfo->pRegions[r]); + } + + anv_blorp_batch_finish(&batch); +} + +static enum isl_format +isl_format_for_size(unsigned size_B) +{ + /* Prefer 32-bit per component formats for CmdFillBuffer */ + switch (size_B) { + case 1: return ISL_FORMAT_R8_UINT; + case 2: return ISL_FORMAT_R16_UINT; + case 3: return ISL_FORMAT_R8G8B8_UINT; + case 4: return ISL_FORMAT_R32_UINT; + case 6: return ISL_FORMAT_R16G16B16_UINT; + case 8: return ISL_FORMAT_R32G32_UINT; + case 12: return ISL_FORMAT_R32G32B32_UINT; + case 16: return ISL_FORMAT_R32G32B32A32_UINT; + default: + unreachable("Unknown format size"); + } +} + +static void +copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer, + struct blorp_batch *batch, + struct anv_buffer *anv_buffer, + struct anv_image *anv_image, + VkImageLayout image_layout, + const VkBufferImageCopy2* region, + bool buffer_to_image) +{ + struct { + struct blorp_surf surf; + uint32_t level; + VkOffset3D offset; + } image, buffer, *src, *dst; + + buffer.level = 0; + buffer.offset = (VkOffset3D) { 0, 0, 0 }; + + if (buffer_to_image) { + src = &buffer; + dst = ℑ + } else { + src = 
ℑ + dst = &buffer; + } + + const VkImageAspectFlags aspect = region->imageSubresource.aspectMask; + + get_blorp_surf_for_anv_image(cmd_buffer->device, anv_image, aspect, + buffer_to_image ? + VK_IMAGE_USAGE_TRANSFER_DST_BIT : + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + image_layout, ISL_AUX_USAGE_NONE, + &image.surf); + image.offset = + vk_image_sanitize_offset(&anv_image->vk, region->imageOffset); + image.level = region->imageSubresource.mipLevel; + + VkExtent3D extent = + vk_image_sanitize_extent(&anv_image->vk, region->imageExtent); + if (anv_image->vk.image_type != VK_IMAGE_TYPE_3D) { + image.offset.z = region->imageSubresource.baseArrayLayer; + extent.depth = + vk_image_subresource_layer_count(&anv_image->vk, + ®ion->imageSubresource); + } + + const enum isl_format linear_format = + anv_get_isl_format(cmd_buffer->device->info, anv_image->vk.format, + aspect, VK_IMAGE_TILING_LINEAR); + const struct isl_format_layout *linear_fmtl = + isl_format_get_layout(linear_format); + + const struct vk_image_buffer_layout buffer_layout = + vk_image_buffer_copy_layout(&anv_image->vk, region); + + /* Some formats have additional restrictions which may cause ISL to + * fail to create a surface for us. For example, YCbCr formats + * have to have 2-pixel aligned strides. + * + * To avoid these issues, we always bind the buffer as if it's a + * "normal" format like RGBA32_UINT. Since we're using blorp_copy, + * the format doesn't matter as long as it has the right bpb. + */ + const VkExtent2D buffer_extent = { + .width = DIV_ROUND_UP(extent.width, linear_fmtl->bw), + .height = DIV_ROUND_UP(extent.height, linear_fmtl->bh), + }; + const enum isl_format buffer_format = + isl_format_for_size(linear_fmtl->bpb / 8); + + struct isl_surf buffer_isl_surf; + get_blorp_surf_for_anv_buffer(cmd_buffer->device, + anv_buffer, region->bufferOffset, + buffer_extent.width, buffer_extent.height, + buffer_layout.row_stride_B, buffer_format, + false, &buffer.surf, &buffer_isl_surf); + + bool dst_has_shadow = false; + struct blorp_surf dst_shadow_surf; + if (&image == dst) { + /* In this case, the source is the buffer and, since blorp takes its + * copy dimensions in terms of the source format, we have to use the + * scaled down version for compressed textures because the source + * format is an RGB format. 
+ */ + extent.width = buffer_extent.width; + extent.height = buffer_extent.height; + + anv_cmd_buffer_mark_image_written(cmd_buffer, anv_image, + aspect, dst->surf.aux_usage, + dst->level, + dst->offset.z, extent.depth); + + dst_has_shadow = + get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, + anv_image, aspect, + &dst_shadow_surf); + } + + for (unsigned z = 0; z < extent.depth; z++) { + blorp_copy(batch, &src->surf, src->level, src->offset.z, + &dst->surf, dst->level, dst->offset.z, + src->offset.x, src->offset.y, dst->offset.x, dst->offset.y, + extent.width, extent.height); + + if (dst_has_shadow) { + blorp_copy(batch, &src->surf, src->level, src->offset.z, + &dst_shadow_surf, dst->level, dst->offset.z, + src->offset.x, src->offset.y, + dst->offset.x, dst->offset.y, + extent.width, extent.height); + } + + image.offset.z++; + buffer.surf.addr.offset += buffer_layout.image_stride_B; + } +} + +void anv_CmdCopyBufferToImage2( + VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2* pCopyBufferToImageInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer); + ANV_FROM_HANDLE(anv_image, dst_image, pCopyBufferToImageInfo->dstImage); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { + copy_buffer_to_image(cmd_buffer, &batch, src_buffer, dst_image, + pCopyBufferToImageInfo->dstImageLayout, + &pCopyBufferToImageInfo->pRegions[r], true); + } + + anv_blorp_batch_finish(&batch); +} + +void anv_CmdCopyImageToBuffer2( + VkCommandBuffer commandBuffer, + const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, src_image, pCopyImageToBufferInfo->srcImage); + ANV_FROM_HANDLE(anv_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < pCopyImageToBufferInfo->regionCount; r++) { + copy_buffer_to_image(cmd_buffer, &batch, dst_buffer, src_image, + pCopyImageToBufferInfo->srcImageLayout, + &pCopyImageToBufferInfo->pRegions[r], false); + } + + anv_blorp_batch_finish(&batch); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES; +} + +static bool +flip_coords(unsigned *src0, unsigned *src1, unsigned *dst0, unsigned *dst1) +{ + bool flip = false; + if (*src0 > *src1) { + unsigned tmp = *src0; + *src0 = *src1; + *src1 = tmp; + flip = !flip; + } + + if (*dst0 > *dst1) { + unsigned tmp = *dst0; + *dst0 = *dst1; + *dst1 = tmp; + flip = !flip; + } + + return flip; +} + +static void +blit_image(struct anv_cmd_buffer *cmd_buffer, + struct blorp_batch *batch, + struct anv_image *src_image, + VkImageLayout src_image_layout, + struct anv_image *dst_image, + VkImageLayout dst_image_layout, + const VkImageBlit2 *region, + VkFilter filter) +{ + const VkImageSubresourceLayers *src_res = ®ion->srcSubresource; + const VkImageSubresourceLayers *dst_res = ®ion->dstSubresource; + + struct blorp_surf src, dst; + + enum blorp_filter blorp_filter; + switch (filter) { + case VK_FILTER_NEAREST: + blorp_filter = BLORP_FILTER_NEAREST; + break; + case VK_FILTER_LINEAR: + blorp_filter = BLORP_FILTER_BILINEAR; + break; + default: + unreachable("Invalid filter"); + } + + assert(anv_image_aspects_compatible(src_res->aspectMask, + dst_res->aspectMask)); + + anv_foreach_image_aspect_bit(aspect_bit, src_image, 
src_res->aspectMask) { + get_blorp_surf_for_anv_image(cmd_buffer->device, + src_image, 1U << aspect_bit, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + src_image_layout, ISL_AUX_USAGE_NONE, &src); + get_blorp_surf_for_anv_image(cmd_buffer->device, + dst_image, 1U << aspect_bit, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + dst_image_layout, ISL_AUX_USAGE_NONE, &dst); + + struct anv_format_plane src_format = + anv_get_format_aspect(cmd_buffer->device->info, src_image->vk.format, + 1U << aspect_bit, src_image->vk.tiling); + struct anv_format_plane dst_format = + anv_get_format_aspect(cmd_buffer->device->info, dst_image->vk.format, + 1U << aspect_bit, dst_image->vk.tiling); + + unsigned dst_start, dst_end; + if (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) { + assert(dst_res->baseArrayLayer == 0); + dst_start = region->dstOffsets[0].z; + dst_end = region->dstOffsets[1].z; + } else { + dst_start = dst_res->baseArrayLayer; + dst_end = dst_start + + vk_image_subresource_layer_count(&dst_image->vk, dst_res); + } + + unsigned src_start, src_end; + if (src_image->vk.image_type == VK_IMAGE_TYPE_3D) { + assert(src_res->baseArrayLayer == 0); + src_start = region->srcOffsets[0].z; + src_end = region->srcOffsets[1].z; + } else { + src_start = src_res->baseArrayLayer; + src_end = src_start + + vk_image_subresource_layer_count(&src_image->vk, src_res); + } + + bool flip_z = flip_coords(&src_start, &src_end, &dst_start, &dst_end); + const unsigned num_layers = dst_end - dst_start; + float src_z_step = (float)(src_end - src_start) / (float)num_layers; + + /* There is no interpolation to the pixel center during rendering, so + * add the 0.5 offset ourselves here. */ + float depth_center_offset = 0; + if (src_image->vk.image_type == VK_IMAGE_TYPE_3D) + depth_center_offset = 0.5 / num_layers * (src_end - src_start); + + if (flip_z) { + src_start = src_end; + src_z_step *= -1; + depth_center_offset *= -1; + } + + unsigned src_x0 = region->srcOffsets[0].x; + unsigned src_x1 = region->srcOffsets[1].x; + unsigned dst_x0 = region->dstOffsets[0].x; + unsigned dst_x1 = region->dstOffsets[1].x; + bool flip_x = flip_coords(&src_x0, &src_x1, &dst_x0, &dst_x1); + + unsigned src_y0 = region->srcOffsets[0].y; + unsigned src_y1 = region->srcOffsets[1].y; + unsigned dst_y0 = region->dstOffsets[0].y; + unsigned dst_y1 = region->dstOffsets[1].y; + bool flip_y = flip_coords(&src_y0, &src_y1, &dst_y0, &dst_y1); + + anv_cmd_buffer_mark_image_written(cmd_buffer, dst_image, + 1U << aspect_bit, + dst.aux_usage, + dst_res->mipLevel, + dst_start, num_layers); + + for (unsigned i = 0; i < num_layers; i++) { + unsigned dst_z = dst_start + i; + float src_z = src_start + i * src_z_step + depth_center_offset; + + blorp_blit(batch, &src, src_res->mipLevel, src_z, + src_format.isl_format, src_format.swizzle, + &dst, dst_res->mipLevel, dst_z, + dst_format.isl_format, dst_format.swizzle, + src_x0, src_y0, src_x1, src_y1, + dst_x0, dst_y0, dst_x1, dst_y1, + blorp_filter, flip_x, flip_y); + } + } +} + +void anv_CmdBlitImage2( + VkCommandBuffer commandBuffer, + const VkBlitImageInfo2* pBlitImageInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, src_image, pBlitImageInfo->srcImage); + ANV_FROM_HANDLE(anv_image, dst_image, pBlitImageInfo->dstImage); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < pBlitImageInfo->regionCount; r++) { + blit_image(cmd_buffer, &batch, + src_image, pBlitImageInfo->srcImageLayout, + dst_image, pBlitImageInfo->dstImageLayout, + 
&pBlitImageInfo->pRegions[r], pBlitImageInfo->filter); + } + + anv_blorp_batch_finish(&batch); +} + +/** + * Returns the greatest common divisor of a and b that is a power of two. + */ +static uint64_t +gcd_pow2_u64(uint64_t a, uint64_t b) +{ + assert(a > 0 || b > 0); + + unsigned a_log2 = ffsll(a) - 1; + unsigned b_log2 = ffsll(b) - 1; + + /* If either a or b is 0, then a_log2 or b_log2 till be UINT_MAX in which + * case, the MIN2() will take the other one. If both are 0 then we will + * hit the assert above. + */ + return 1 << MIN2(a_log2, b_log2); +} + +/* This is maximum possible width/height our HW can handle */ +#define MAX_SURFACE_DIM (1ull << 14) + +static void +copy_buffer(struct anv_device *device, + struct blorp_batch *batch, + struct anv_buffer *src_buffer, + struct anv_buffer *dst_buffer, + const VkBufferCopy2 *region) +{ + struct blorp_address src = { + .buffer = src_buffer->address.bo, + .offset = src_buffer->address.offset + region->srcOffset, + .mocs = anv_mocs(device, src_buffer->address.bo, + ISL_SURF_USAGE_TEXTURE_BIT), + }; + struct blorp_address dst = { + .buffer = dst_buffer->address.bo, + .offset = dst_buffer->address.offset + region->dstOffset, + .mocs = anv_mocs(device, dst_buffer->address.bo, + ISL_SURF_USAGE_RENDER_TARGET_BIT), + }; + + blorp_buffer_copy(batch, src, dst, region->size); +} + +void anv_CmdCopyBuffer2( + VkCommandBuffer commandBuffer, + const VkCopyBufferInfo2* pCopyBufferInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, src_buffer, pCopyBufferInfo->srcBuffer); + ANV_FROM_HANDLE(anv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < pCopyBufferInfo->regionCount; r++) { + copy_buffer(cmd_buffer->device, &batch, src_buffer, dst_buffer, + &pCopyBufferInfo->pRegions[r]); + } + + anv_blorp_batch_finish(&batch); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES; +} + + +void anv_CmdUpdateBuffer( + VkCommandBuffer commandBuffer, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize dataSize, + const void* pData) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + /* We can't quite grab a full block because the state stream needs a + * little data at the top to build its linked list. + */ + const uint32_t max_update_size = + cmd_buffer->device->dynamic_state_pool.block_size - 64; + + assert(max_update_size < MAX_SURFACE_DIM * 4); + + /* We're about to read data that was written from the CPU. Flush the + * texture cache so we don't get anything stale. 
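[Editor's note] gcd_pow2_u64, defined earlier in this file, picks the largest power of two dividing both arguments by comparing trailing-zero counts (ffsll returns the 1-based index of the lowest set bit, so 0 underflows to UINT_MAX and loses the MIN). A quick standalone check of the same math, using the compiler builtin rather than the driver's copy:

#include <assert.h>
#include <stdint.h>

static uint64_t
pow2_gcd(uint64_t a, uint64_t b)
{
   assert(a > 0 || b > 0);
   unsigned a_log2 = __builtin_ffsll(a) - 1;   /* trailing zeros; UINT_MAX if a == 0 */
   unsigned b_log2 = __builtin_ffsll(b) - 1;
   unsigned lo = a_log2 < b_log2 ? a_log2 : b_log2;
   return 1ull << lo;
}

int
main(void)
{
   assert(pow2_gcd(12, 40) == 4);    /* 12 = 0b1100, 40 = 0b101000 */
   assert(pow2_gcd(16, 64) == 16);
   assert(pow2_gcd(0, 8) == 8);      /* 0 falls back to the other argument */
   return 0;
}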
+ */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, + "before UpdateBuffer"); + + while (dataSize) { + const uint32_t copy_size = MIN2(dataSize, max_update_size); + + struct anv_state tmp_data = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, copy_size, 64); + + memcpy(tmp_data.map, pData, copy_size); + + struct blorp_address src = { + .buffer = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = tmp_data.offset, + .mocs = isl_mocs(&cmd_buffer->device->isl_dev, + ISL_SURF_USAGE_TEXTURE_BIT, false) + }; + struct blorp_address dst = { + .buffer = dst_buffer->address.bo, + .offset = dst_buffer->address.offset + dstOffset, + .mocs = anv_mocs(cmd_buffer->device, dst_buffer->address.bo, + ISL_SURF_USAGE_RENDER_TARGET_BIT), + }; + + blorp_buffer_copy(&batch, src, dst, copy_size); + + dataSize -= copy_size; + dstOffset += copy_size; + pData = (void *)pData + copy_size; + } + + anv_blorp_batch_finish(&batch); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES; +} + +void anv_CmdFillBuffer( + VkCommandBuffer commandBuffer, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize fillSize, + uint32_t data) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer); + struct blorp_surf surf; + struct isl_surf isl_surf; + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + fillSize = vk_buffer_range(&dst_buffer->vk, dstOffset, fillSize); + + /* From the Vulkan spec: + * + * "size is the number of bytes to fill, and must be either a multiple + * of 4, or VK_WHOLE_SIZE to fill the range from offset to the end of + * the buffer. If VK_WHOLE_SIZE is used and the remaining size of the + * buffer is not a multiple of 4, then the nearest smaller multiple is + * used." + */ + fillSize &= ~3ull; + + /* First, we compute the biggest format that can be used with the + * given offsets and size. 
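[Editor's note] The fill path that follows first derives the widest texel size that both dstOffset and fillSize are aligned to, then carves the fill into full MAX_SURFACE_DIM x MAX_SURFACE_DIM rectangles, a run of full-width rows, and a final partial row. A worked sketch of that decomposition under the same 1 << 14 surface limit (the concrete offset and size are made up for illustration):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_SURFACE_DIM (1ull << 14)

int
main(void)
{
   uint64_t dst_offset = 8;              /* bytes */
   uint64_t fill_size = 5ull << 30;      /* 5 GiB, already a multiple of 4 */

   /* Widest power-of-two texel size (<= 16) dividing both offset and size;
    * ends up at 8 here, i.e. an R32G32_UINT-sized texel.
    */
   uint64_t bs = 16;
   while ((dst_offset % bs) || (fill_size % bs))
      bs >>= 1;

   uint64_t texels = fill_size / bs;
   uint64_t per_rect = MAX_SURFACE_DIM * MAX_SURFACE_DIM;

   uint64_t full_rects = texels / per_rect;
   uint64_t rows = (texels % per_rect) / MAX_SURFACE_DIM;
   uint64_t tail = (texels % per_rect) % MAX_SURFACE_DIM;

   printf("%u-byte texels: %" PRIu64 " full 16384x16384 clears, "
          "one %" PRIu64 "-row clear, one %" PRIu64 "-texel row\n",
          (unsigned)bs, full_rects, rows, tail);
   return 0;
}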
+ */ + int bs = 16; + bs = gcd_pow2_u64(bs, dstOffset); + bs = gcd_pow2_u64(bs, fillSize); + enum isl_format isl_format = isl_format_for_size(bs); + + union isl_color_value color = { + .u32 = { data, data, data, data }, + }; + + const uint64_t max_fill_size = MAX_SURFACE_DIM * MAX_SURFACE_DIM * bs; + while (fillSize >= max_fill_size) { + get_blorp_surf_for_anv_buffer(cmd_buffer->device, + dst_buffer, dstOffset, + MAX_SURFACE_DIM, MAX_SURFACE_DIM, + MAX_SURFACE_DIM * bs, isl_format, true, + &surf, &isl_surf); + + blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY, + 0, 0, 1, 0, 0, MAX_SURFACE_DIM, MAX_SURFACE_DIM, + color, 0 /* color_write_disable */); + fillSize -= max_fill_size; + dstOffset += max_fill_size; + } + + uint64_t height = fillSize / (MAX_SURFACE_DIM * bs); + assert(height < MAX_SURFACE_DIM); + if (height != 0) { + const uint64_t rect_fill_size = height * MAX_SURFACE_DIM * bs; + get_blorp_surf_for_anv_buffer(cmd_buffer->device, + dst_buffer, dstOffset, + MAX_SURFACE_DIM, height, + MAX_SURFACE_DIM * bs, isl_format, true, + &surf, &isl_surf); + + blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY, + 0, 0, 1, 0, 0, MAX_SURFACE_DIM, height, + color, 0 /* color_write_disable */); + fillSize -= rect_fill_size; + dstOffset += rect_fill_size; + } + + if (fillSize != 0) { + const uint32_t width = fillSize / bs; + get_blorp_surf_for_anv_buffer(cmd_buffer->device, + dst_buffer, dstOffset, + width, 1, + width * bs, isl_format, true, + &surf, &isl_surf); + + blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY, + 0, 0, 1, 0, 0, width, 1, + color, 0 /* color_write_disable */); + } + + anv_blorp_batch_finish(&batch); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES; +} + +void anv_CmdClearColorImage( + VkCommandBuffer commandBuffer, + VkImage _image, + VkImageLayout imageLayout, + const VkClearColorValue* pColor, + uint32_t rangeCount, + const VkImageSubresourceRange* pRanges) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, image, _image); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < rangeCount; r++) { + if (pRanges[r].aspectMask == 0) + continue; + + assert(pRanges[r].aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, pRanges[r].aspectMask, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + imageLayout, ISL_AUX_USAGE_NONE, &surf); + + struct anv_format_plane src_format = + anv_get_format_aspect(cmd_buffer->device->info, image->vk.format, + VK_IMAGE_ASPECT_COLOR_BIT, image->vk.tiling); + + unsigned base_layer = pRanges[r].baseArrayLayer; + uint32_t layer_count = + vk_image_subresource_layer_count(&image->vk, &pRanges[r]); + uint32_t level_count = + vk_image_subresource_level_count(&image->vk, &pRanges[r]); + + for (uint32_t i = 0; i < level_count; i++) { + const unsigned level = pRanges[r].baseMipLevel + i; + const unsigned level_width = anv_minify(image->vk.extent.width, level); + const unsigned level_height = anv_minify(image->vk.extent.height, level); + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(image->vk.extent.depth, level); + } + + anv_cmd_buffer_mark_image_written(cmd_buffer, image, + pRanges[r].aspectMask, + surf.aux_usage, level, + base_layer, layer_count); + + blorp_clear(&batch, &surf, + src_format.isl_format, src_format.swizzle, + level, base_layer, layer_count, + 0, 0, level_width, level_height, + 
vk_to_isl_color(*pColor), 0 /* color_write_disable */); + } + } + + anv_blorp_batch_finish(&batch); +} + +void anv_CmdClearDepthStencilImage( + VkCommandBuffer commandBuffer, + VkImage image_h, + VkImageLayout imageLayout, + const VkClearDepthStencilValue* pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange* pRanges) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, image, image_h); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct blorp_surf depth, stencil, stencil_shadow; + if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + imageLayout, ISL_AUX_USAGE_NONE, &depth); + } else { + memset(&depth, 0, sizeof(depth)); + } + + bool has_stencil_shadow = false; + if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_STENCIL_BIT, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + imageLayout, ISL_AUX_USAGE_NONE, &stencil); + + has_stencil_shadow = + get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + &stencil_shadow); + } else { + memset(&stencil, 0, sizeof(stencil)); + } + + for (unsigned r = 0; r < rangeCount; r++) { + if (pRanges[r].aspectMask == 0) + continue; + + bool clear_depth = pRanges[r].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT; + bool clear_stencil = pRanges[r].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT; + + unsigned base_layer = pRanges[r].baseArrayLayer; + uint32_t layer_count = + vk_image_subresource_layer_count(&image->vk, &pRanges[r]); + uint32_t level_count = + vk_image_subresource_level_count(&image->vk, &pRanges[r]); + + for (uint32_t i = 0; i < level_count; i++) { + const unsigned level = pRanges[r].baseMipLevel + i; + const unsigned level_width = anv_minify(image->vk.extent.width, level); + const unsigned level_height = anv_minify(image->vk.extent.height, level); + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) + layer_count = anv_minify(image->vk.extent.depth, level); + + blorp_clear_depth_stencil(&batch, &depth, &stencil, + level, base_layer, layer_count, + 0, 0, level_width, level_height, + clear_depth, pDepthStencil->depth, + clear_stencil ? 0xff : 0, + pDepthStencil->stencil); + + if (clear_stencil && has_stencil_shadow) { + union isl_color_value stencil_color = { + .u32 = { pDepthStencil->stencil, }, + }; + blorp_clear(&batch, &stencil_shadow, + ISL_FORMAT_R8_UINT, ISL_SWIZZLE_IDENTITY, + level, base_layer, layer_count, + 0, 0, level_width, level_height, + stencil_color, 0 /* color_write_disable */); + } + } + } + + anv_blorp_batch_finish(&batch); +} + +VkResult +anv_cmd_buffer_alloc_blorp_binding_table(struct anv_cmd_buffer *cmd_buffer, + uint32_t num_entries, + uint32_t *state_offset, + struct anv_state *bt_state) +{ + *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, num_entries, + state_offset); + if (bt_state->map == NULL) { + /* We ran out of space. Grab a new binding table block. */ + VkResult result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); + if (result != VK_SUCCESS) + return result; + + /* Re-emit state base addresses so we get the new surface state base + * address before we start emitting binding tables etc. 
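[Editor's note] The clear loops above size each mip level with anv_minify, i.e. the base extent shifted right by the level index and clamped to at least 1; 3D images additionally re-derive their layer count from the minified depth. A small worked example of that computation (local helper, not the driver's):

#include <assert.h>
#include <stdint.h>

/* Extent of a mip level: base >> level, but never smaller than 1. */
static uint32_t
minify(uint32_t base, uint32_t level)
{
   uint32_t v = base >> level;
   return v ? v : 1;
}

int
main(void)
{
   /* A 1024x768 image: level 4 is 64x48, and level 10 is clamped to 1x1
    * even though 768 >> 10 would be 0.
    */
   assert(minify(1024, 4) == 64);
   assert(minify(768, 4) == 48);
   assert(minify(1024, 10) == 1);
   assert(minify(768, 10) == 1);
   return 0;
}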
+ */ + anv_cmd_buffer_emit_state_base_address(cmd_buffer); + + *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, num_entries, + state_offset); + assert(bt_state->map != NULL); + } + + return VK_SUCCESS; +} + +static VkResult +binding_table_for_surface_state(struct anv_cmd_buffer *cmd_buffer, + struct anv_state surface_state, + uint32_t *bt_offset) +{ + uint32_t state_offset; + struct anv_state bt_state; + + VkResult result = + anv_cmd_buffer_alloc_blorp_binding_table(cmd_buffer, 1, &state_offset, + &bt_state); + if (result != VK_SUCCESS) + return result; + + uint32_t *bt_map = bt_state.map; + bt_map[0] = surface_state.offset + state_offset; + + *bt_offset = bt_state.offset; + return VK_SUCCESS; +} + +static void +clear_color_attachment(struct anv_cmd_buffer *cmd_buffer, + struct blorp_batch *batch, + const VkClearAttachment *attachment, + uint32_t rectCount, const VkClearRect *pRects) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const uint32_t att_idx = attachment->colorAttachment; + assert(att_idx < gfx->color_att_count); + const struct anv_attachment *att = &gfx->color_att[att_idx]; + + if (att->vk_format == VK_FORMAT_UNDEFINED) + return; + + uint32_t binding_table; + VkResult result = + binding_table_for_surface_state(cmd_buffer, att->surface_state.state, + &binding_table); + if (result != VK_SUCCESS) + return; + + union isl_color_value clear_color = + vk_to_isl_color(attachment->clearValue.color); + + /* If multiview is enabled we ignore baseArrayLayer and layerCount */ + if (gfx->view_mask) { + u_foreach_bit(view_idx, gfx->view_mask) { + for (uint32_t r = 0; r < rectCount; ++r) { + const VkOffset2D offset = pRects[r].rect.offset; + const VkExtent2D extent = pRects[r].rect.extent; + blorp_clear_attachments(batch, binding_table, + ISL_FORMAT_UNSUPPORTED, + gfx->samples, + view_idx, 1, + offset.x, offset.y, + offset.x + extent.width, + offset.y + extent.height, + true, clear_color, false, 0.0f, 0, 0); + } + } + return; + } + + for (uint32_t r = 0; r < rectCount; ++r) { + const VkOffset2D offset = pRects[r].rect.offset; + const VkExtent2D extent = pRects[r].rect.extent; + assert(pRects[r].layerCount != VK_REMAINING_ARRAY_LAYERS); + blorp_clear_attachments(batch, binding_table, + ISL_FORMAT_UNSUPPORTED, + gfx->samples, + pRects[r].baseArrayLayer, + pRects[r].layerCount, + offset.x, offset.y, + offset.x + extent.width, offset.y + extent.height, + true, clear_color, false, 0.0f, 0, 0); + } +} + +static void +clear_depth_stencil_attachment(struct anv_cmd_buffer *cmd_buffer, + struct blorp_batch *batch, + const VkClearAttachment *attachment, + uint32_t rectCount, const VkClearRect *pRects) +{ + static const union isl_color_value color_value = { .u32 = { 0, } }; + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct anv_attachment *d_att = &gfx->depth_att; + const struct anv_attachment *s_att = &gfx->stencil_att; + if (d_att->vk_format == VK_FORMAT_UNDEFINED && + s_att->vk_format == VK_FORMAT_UNDEFINED) + return; + + bool clear_depth = attachment->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT; + bool clear_stencil = attachment->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT; + + enum isl_format depth_format = ISL_FORMAT_UNSUPPORTED; + if (d_att->vk_format != VK_FORMAT_UNDEFINED) { + depth_format = anv_get_isl_format(cmd_buffer->device->info, + d_att->vk_format, + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_TILING_OPTIMAL); + } + + uint32_t binding_table; + VkResult result = + binding_table_for_surface_state(cmd_buffer, + gfx->null_surface_state, + 
&binding_table); + if (result != VK_SUCCESS) + return; + + /* If multiview is enabled we ignore baseArrayLayer and layerCount */ + if (gfx->view_mask) { + u_foreach_bit(view_idx, gfx->view_mask) { + for (uint32_t r = 0; r < rectCount; ++r) { + const VkOffset2D offset = pRects[r].rect.offset; + const VkExtent2D extent = pRects[r].rect.extent; + VkClearDepthStencilValue value = attachment->clearValue.depthStencil; + blorp_clear_attachments(batch, binding_table, + depth_format, + gfx->samples, + view_idx, 1, + offset.x, offset.y, + offset.x + extent.width, + offset.y + extent.height, + false, color_value, + clear_depth, value.depth, + clear_stencil ? 0xff : 0, value.stencil); + } + } + return; + } + + for (uint32_t r = 0; r < rectCount; ++r) { + const VkOffset2D offset = pRects[r].rect.offset; + const VkExtent2D extent = pRects[r].rect.extent; + VkClearDepthStencilValue value = attachment->clearValue.depthStencil; + assert(pRects[r].layerCount != VK_REMAINING_ARRAY_LAYERS); + blorp_clear_attachments(batch, binding_table, + depth_format, + gfx->samples, + pRects[r].baseArrayLayer, + pRects[r].layerCount, + offset.x, offset.y, + offset.x + extent.width, offset.y + extent.height, + false, color_value, + clear_depth, value.depth, + clear_stencil ? 0xff : 0, value.stencil); + } +} + +void anv_CmdClearAttachments( + VkCommandBuffer commandBuffer, + uint32_t attachmentCount, + const VkClearAttachment* pAttachments, + uint32_t rectCount, + const VkClearRect* pRects) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + /* Because this gets called within a render pass, we tell blorp not to + * trash our depth and stencil buffers. + */ + struct blorp_batch batch; + enum blorp_batch_flags flags = BLORP_BATCH_NO_EMIT_DEPTH_STENCIL; + if (cmd_buffer->state.conditional_render_enabled) { + anv_cmd_emit_conditional_render_predicate(cmd_buffer); + flags |= BLORP_BATCH_PREDICATE_ENABLE; + } + anv_blorp_batch_init(cmd_buffer, &batch, flags); + + for (uint32_t a = 0; a < attachmentCount; ++a) { + if (pAttachments[a].aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + assert(pAttachments[a].aspectMask == VK_IMAGE_ASPECT_COLOR_BIT); + clear_color_attachment(cmd_buffer, &batch, + &pAttachments[a], + rectCount, pRects); + } else { + clear_depth_stencil_attachment(cmd_buffer, &batch, + &pAttachments[a], + rectCount, pRects); + } + } + + anv_blorp_batch_finish(&batch); +} + +enum subpass_stage { + SUBPASS_STAGE_LOAD, + SUBPASS_STAGE_DRAW, + SUBPASS_STAGE_RESOLVE, +}; + +void +anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *src_image, + enum isl_aux_usage src_aux_usage, + uint32_t src_level, uint32_t src_base_layer, + const struct anv_image *dst_image, + enum isl_aux_usage dst_aux_usage, + uint32_t dst_level, uint32_t dst_base_layer, + VkImageAspectFlagBits aspect, + uint32_t src_x, uint32_t src_y, + uint32_t dst_x, uint32_t dst_y, + uint32_t width, uint32_t height, + uint32_t layer_count, + enum blorp_filter filter) +{ + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + assert(src_image->vk.image_type == VK_IMAGE_TYPE_2D); + assert(src_image->vk.samples > 1); + assert(dst_image->vk.image_type == VK_IMAGE_TYPE_2D); + assert(dst_image->vk.samples == 1); + + struct blorp_surf src_surf, dst_surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, src_image, aspect, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + src_aux_usage, &src_surf); + if (src_aux_usage == 
ISL_AUX_USAGE_MCS) { + src_surf.clear_color_addr = anv_to_blorp_address( + anv_image_get_clear_color_addr(cmd_buffer->device, src_image, + VK_IMAGE_ASPECT_COLOR_BIT)); + } + get_blorp_surf_for_anv_image(cmd_buffer->device, dst_image, aspect, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + dst_aux_usage, &dst_surf); + anv_cmd_buffer_mark_image_written(cmd_buffer, dst_image, + aspect, dst_aux_usage, + dst_level, dst_base_layer, layer_count); + + if (filter == BLORP_FILTER_NONE) { + /* If no explicit filter is provided, then it's implied by the type of + * the source image. + */ + if ((src_surf.surf->usage & ISL_SURF_USAGE_DEPTH_BIT) || + (src_surf.surf->usage & ISL_SURF_USAGE_STENCIL_BIT) || + isl_format_has_int_channel(src_surf.surf->format)) { + filter = BLORP_FILTER_SAMPLE_0; + } else { + filter = BLORP_FILTER_AVERAGE; + } + } + + for (uint32_t l = 0; l < layer_count; l++) { + blorp_blit(&batch, + &src_surf, src_level, src_base_layer + l, + ISL_FORMAT_UNSUPPORTED, ISL_SWIZZLE_IDENTITY, + &dst_surf, dst_level, dst_base_layer + l, + ISL_FORMAT_UNSUPPORTED, ISL_SWIZZLE_IDENTITY, + src_x, src_y, src_x + width, src_y + height, + dst_x, dst_y, dst_x + width, dst_y + height, + filter, false, false); + } + + anv_blorp_batch_finish(&batch); +} + +static void +resolve_image(struct anv_cmd_buffer *cmd_buffer, + struct anv_image *src_image, + VkImageLayout src_image_layout, + struct anv_image *dst_image, + VkImageLayout dst_image_layout, + const VkImageResolve2 *region) +{ + assert(region->srcSubresource.aspectMask == region->dstSubresource.aspectMask); + assert(vk_image_subresource_layer_count(&src_image->vk, &region->srcSubresource) == + vk_image_subresource_layer_count(&dst_image->vk, &region->dstSubresource)); + + const uint32_t layer_count = + vk_image_subresource_layer_count(&dst_image->vk, &region->dstSubresource); + + anv_foreach_image_aspect_bit(aspect_bit, src_image, + region->srcSubresource.aspectMask) { + enum isl_aux_usage src_aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, src_image, + (1 << aspect_bit), + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + src_image_layout); + enum isl_aux_usage dst_aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, dst_image, + (1 << aspect_bit), + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + dst_image_layout); + + anv_image_msaa_resolve(cmd_buffer, + src_image, src_aux_usage, + region->srcSubresource.mipLevel, + region->srcSubresource.baseArrayLayer, + dst_image, dst_aux_usage, + region->dstSubresource.mipLevel, + region->dstSubresource.baseArrayLayer, + (1 << aspect_bit), + region->srcOffset.x, + region->srcOffset.y, + region->dstOffset.x, + region->dstOffset.y, + region->extent.width, + region->extent.height, + layer_count, BLORP_FILTER_NONE); + } +} + +void anv_CmdResolveImage2( + VkCommandBuffer commandBuffer, + const VkResolveImageInfo2* pResolveImageInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, src_image, pResolveImageInfo->srcImage); + ANV_FROM_HANDLE(anv_image, dst_image, pResolveImageInfo->dstImage); + + for (uint32_t r = 0; r < pResolveImageInfo->regionCount; r++) { + resolve_image(cmd_buffer, + src_image, pResolveImageInfo->srcImageLayout, + dst_image, pResolveImageInfo->dstImageLayout, + &pResolveImageInfo->pRegions[r]); + } +} + +void +anv_image_copy_to_shadow(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count) +{ + struct blorp_batch
batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + /* We don't know who touched the main surface last so flush a bunch of + * caches to ensure we get good data. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, + "before copy_to_shadow"); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, aspect, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + VK_IMAGE_LAYOUT_GENERAL, + ISL_AUX_USAGE_NONE, &surf); + assert(surf.aux_usage == ISL_AUX_USAGE_NONE); + + struct blorp_surf shadow_surf; + get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, + image, aspect, &shadow_surf); + + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = base_level + l; + + const VkExtent3D extent = vk_image_mip_level_extent(&image->vk, level); + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) + layer_count = extent.depth; + + for (uint32_t a = 0; a < layer_count; a++) { + const uint32_t layer = base_layer + a; + + blorp_copy(&batch, &surf, level, layer, + &shadow_surf, level, layer, + 0, 0, 0, 0, extent.width, extent.height); + } + } + + /* We just wrote to the buffer with the render cache. Flush it. */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, + "after copy_to_shadow"); + + anv_blorp_batch_finish(&batch); +} + +void +anv_image_clear_color(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + enum isl_format format, struct isl_swizzle swizzle, + uint32_t level, uint32_t base_layer, uint32_t layer_count, + VkRect2D area, union isl_color_value clear_color) +{ + assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); + + /* We don't support planar images with multisampling yet */ + assert(image->n_planes == 1); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + aux_usage, &surf); + anv_cmd_buffer_mark_image_written(cmd_buffer, image, aspect, aux_usage, + level, base_layer, layer_count); + + blorp_clear(&batch, &surf, format, anv_swizzle_for_render(swizzle), + level, base_layer, layer_count, + area.offset.x, area.offset.y, + area.offset.x + area.extent.width, + area.offset.y + area.extent.height, + clear_color, 0 /* color_write_disable */); + + anv_blorp_batch_finish(&batch); +} + +void +anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlags aspects, + enum isl_aux_usage depth_aux_usage, + uint32_t level, + uint32_t base_layer, uint32_t layer_count, + VkRect2D area, + float depth_value, uint8_t stencil_value) +{ + assert(image->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT)); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct blorp_surf depth = {}; + if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + depth_aux_usage, &depth); + } + + struct blorp_surf stencil = {}; + if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); + 
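/* Unlike the depth surface above, which uses the caller-provided depth_aux_usage, the stencil surface is always set up with the aux usage recorded on its own plane. */ +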
get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_STENCIL_BIT, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + image->planes[plane].aux_usage, &stencil); + } + + /* Blorp may choose to clear stencil using RGBA32_UINT for better + * performance. If it does this, we need to flush it out of the depth + * cache before rendering to it. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before clear DS"); + + blorp_clear_depth_stencil(&batch, &depth, &stencil, + level, base_layer, layer_count, + area.offset.x, area.offset.y, + area.offset.x + area.extent.width, + area.offset.y + area.extent.height, + aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + depth_value, + (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) ? 0xff : 0, + stencil_value); + + /* Blorp may choose to clear stencil using RGBA32_UINT for better + * performance. If it does this, we need to flush it out of the render + * cache before someone starts trying to do stencil on it. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after clear DS"); + + struct blorp_surf stencil_shadow; + if ((aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && + get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + &stencil_shadow)) { + union isl_color_value stencil_color = { + .u32 = { stencil_value }, + }; + blorp_clear(&batch, &stencil_shadow, + ISL_FORMAT_R8_UINT, ISL_SWIZZLE_IDENTITY, + level, base_layer, layer_count, + area.offset.x, area.offset.y, + area.offset.x + area.extent.width, + area.offset.y + area.extent.height, + stencil_color, 0 /* color_write_disable */); + } + + anv_blorp_batch_finish(&batch); +} + +void +anv_image_hiz_op(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, uint32_t level, + uint32_t base_layer, uint32_t layer_count, + enum isl_aux_op hiz_op) +{ + assert(aspect == VK_IMAGE_ASPECT_DEPTH_BIT); + assert(base_layer + layer_count <= anv_image_aux_layers(image, aspect, level)); + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + assert(plane == 0); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + image->planes[plane].aux_usage, &surf); + + blorp_hiz_op(&batch, &surf, level, base_layer, layer_count, hiz_op); + + anv_blorp_batch_finish(&batch); +} + +void +anv_image_hiz_clear(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlags aspects, + uint32_t level, + uint32_t base_layer, uint32_t layer_count, + VkRect2D area, uint8_t stencil_value) +{ + assert(image->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT)); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct blorp_surf depth = {}; + if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT); + assert(base_layer + layer_count <= + anv_image_aux_layers(image, VK_IMAGE_ASPECT_DEPTH_BIT, level)); + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + image->planes[plane].aux_usage, &depth); + } + + struct blorp_surf stencil = {}; + if 
(aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_STENCIL_BIT, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + image->planes[plane].aux_usage, &stencil); + } + + /* From the Sky Lake PRM Volume 7, "Depth Buffer Clear": + * + * "The following is required when performing a depth buffer clear with + * using the WM_STATE or 3DSTATE_WM: + * + * * If other rendering operations have preceded this clear, a + * PIPE_CONTROL with depth cache flush enabled, Depth Stall bit + * enabled must be issued before the rectangle primitive used for + * the depth buffer clear operation. + * * [...]" + * + * Even though the PRM only says that this is required if using 3DSTATE_WM + * and a 3DPRIMITIVE, the GPU appears to also need this to avoid occasional + * hangs when doing a clear with WM_HZ_OP. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT, + "before clear hiz"); + + if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && + depth.aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT) { + /* From Bspec 47010 (Depth Buffer Clear): + * + * Since the fast clear cycles to CCS are not cached in TileCache, + * any previous depth buffer writes to overlapping pixels must be + * flushed out of TileCache before a succeeding Depth Buffer Clear. + * This restriction only applies to Depth Buffer with write-thru + * enabled, since fast clears to CCS only occur for write-thru mode. + * + * There may have been a write to this depth buffer. Flush it from the + * tile cache just in case. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_TILE_CACHE_FLUSH_BIT, + "before clear hiz_ccs_wt"); + } + + blorp_hiz_clear_depth_stencil(&batch, &depth, &stencil, + level, base_layer, layer_count, + area.offset.x, area.offset.y, + area.offset.x + area.extent.width, + area.offset.y + area.extent.height, + aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ANV_HZ_FC_VAL, + aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + stencil_value); + + anv_blorp_batch_finish(&batch); + + /* From the SKL PRM, Depth Buffer Clear: + * + * "Depth Buffer Clear Workaround + * + * Depth buffer clear pass using any of the methods (WM_STATE, + * 3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a PIPE_CONTROL + * command with DEPTH_STALL bit and Depth FLUSH bits “set” before + * starting to render. DepthStall and DepthFlush are not needed between + * consecutive depth clear passes nor is it required if the depth-clear + * pass was done with “full_surf_clear” bit set in the + * 3DSTATE_WM_HZ_OP." + * + * Even though the PRM provides a bunch of conditions under which this is + * supposedly unnecessary, we choose to perform the flush unconditionally + * just to be safe. 
+ */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT, + "after clear hiz"); +} + +void +anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + enum isl_format format, struct isl_swizzle swizzle, + VkImageAspectFlagBits aspect, + uint32_t base_layer, uint32_t layer_count, + enum isl_aux_op mcs_op, union isl_color_value *clear_value, + bool predicate) +{ + assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); + assert(image->vk.samples > 1); + assert(base_layer + layer_count <= anv_image_aux_layers(image, aspect, 0)); + + /* Multisampling with multi-planar formats is not supported */ + assert(image->n_planes == 1); + + const struct intel_device_info *devinfo = cmd_buffer->device->info; + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, + BLORP_BATCH_PREDICATE_ENABLE * predicate + + BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + ISL_AUX_USAGE_MCS, &surf); + + /* Blorp will store the clear color for us if we provide the clear color + * address and we are doing a fast clear. So we save the clear value into + * the blorp surface. + */ + if (clear_value) + surf.clear_color = *clear_value; + + /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear": + * + * "After Render target fast clear, pipe-control with color cache + * write-flush must be issued before sending any DRAW commands on + * that render target." + * + * This comment is a bit cryptic and doesn't really tell you what's going + * or what's really needed. It appears that fast clear ops are not + * properly synchronized with other drawing. This means that we cannot + * have a fast clear operation in the pipe at the same time as other + * regular drawing operations. We need to use a PIPE_CONTROL to ensure + * that the contents of the previous draw hit the render target before we + * resolve and then use a second PIPE_CONTROL after the resolve to ensure + * that it is completed before any additional drawing occurs. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_TILE_CACHE_FLUSH_BIT | + (devinfo->verx10 == 120 ? + ANV_PIPE_DEPTH_STALL_BIT : 0) | + (devinfo->verx10 == 125 ? + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | + ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0) | + ANV_PIPE_PSS_STALL_SYNC_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before fast clear mcs"); + + switch (mcs_op) { + case ISL_AUX_OP_FAST_CLEAR: + blorp_fast_clear(&batch, &surf, format, swizzle, + 0, base_layer, layer_count, + 0, 0, image->vk.extent.width, image->vk.extent.height); + break; + case ISL_AUX_OP_PARTIAL_RESOLVE: + blorp_mcs_partial_resolve(&batch, &surf, format, + base_layer, layer_count); + break; + case ISL_AUX_OP_FULL_RESOLVE: + case ISL_AUX_OP_AMBIGUATE: + default: + unreachable("Unsupported MCS operation"); + } + + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + (devinfo->verx10 == 120 ? 
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT : 0) | + ANV_PIPE_PSS_STALL_SYNC_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after fast clear mcs"); + + anv_blorp_batch_finish(&batch); +} + +void +anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + enum isl_format format, struct isl_swizzle swizzle, + VkImageAspectFlagBits aspect, uint32_t level, + uint32_t base_layer, uint32_t layer_count, + enum isl_aux_op ccs_op, union isl_color_value *clear_value, + bool predicate) +{ + assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + assert(image->vk.samples == 1); + assert(level < anv_image_aux_levels(image, aspect)); + /* Multi-LOD YcBcR is not allowed */ + assert(image->n_planes == 1 || level == 0); + assert(base_layer + layer_count <= + anv_image_aux_layers(image, aspect, level)); + + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + const struct intel_device_info *devinfo = cmd_buffer->device->info; + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, + BLORP_BATCH_PREDICATE_ENABLE * predicate + + BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + image->planes[plane].aux_usage, + &surf); + + uint32_t level_width = anv_minify(surf.surf->logical_level0_px.w, level); + uint32_t level_height = anv_minify(surf.surf->logical_level0_px.h, level); + + /* Blorp will store the clear color for us if we provide the clear color + * address and we are doing a fast clear. So we save the clear value into + * the blorp surface. + */ + if (clear_value) + surf.clear_color = *clear_value; + + /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear": + * + * "After Render target fast clear, pipe-control with color cache + * write-flush must be issued before sending any DRAW commands on + * that render target." + * + * This comment is a bit cryptic and doesn't really tell you what's going + * or what's really needed. It appears that fast clear ops are not + * properly synchronized with other drawing. This means that we cannot + * have a fast clear operation in the pipe at the same time as other + * regular drawing operations. We need to use a PIPE_CONTROL to ensure + * that the contents of the previous draw hit the render target before we + * resolve and then use a second PIPE_CONTROL after the resolve to ensure + * that it is completed before any additional drawing occurs. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_TILE_CACHE_FLUSH_BIT | + (devinfo->verx10 == 120 ? + ANV_PIPE_DEPTH_STALL_BIT : 0) | + (devinfo->verx10 == 125 ? 
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | + ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0) | + ANV_PIPE_PSS_STALL_SYNC_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before fast clear ccs"); + + switch (ccs_op) { + case ISL_AUX_OP_FAST_CLEAR: + blorp_fast_clear(&batch, &surf, format, swizzle, + level, base_layer, layer_count, + 0, 0, level_width, level_height); + break; + case ISL_AUX_OP_FULL_RESOLVE: + case ISL_AUX_OP_PARTIAL_RESOLVE: + blorp_ccs_resolve(&batch, &surf, level, base_layer, layer_count, + format, ccs_op); + break; + case ISL_AUX_OP_AMBIGUATE: + for (uint32_t a = 0; a < layer_count; a++) { + const uint32_t layer = base_layer + a; + blorp_ccs_ambiguate(&batch, &surf, level, layer); + } + break; + default: + unreachable("Unsupported CCS operation"); + } + + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + (devinfo->verx10 == 120 ? + ANV_PIPE_TILE_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT : 0) | + ANV_PIPE_PSS_STALL_SYNC_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after fast clear ccs"); + + anv_blorp_batch_finish(&batch); +} diff --git a/src/intel/vulkan_hasvk/anv_bo_sync.c b/src/intel/vulkan_hasvk/anv_bo_sync.c new file mode 100644 index 00000000000..149ae2c2ba2 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_bo_sync.c @@ -0,0 +1,237 @@ +/* + * Copyright © 2021 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" + +#include "util/os_time.h" + +static struct anv_bo_sync * +to_anv_bo_sync(struct vk_sync *sync) +{ + assert(sync->type == &anv_bo_sync_type); + return container_of(sync, struct anv_bo_sync, sync); +} + +static VkResult +anv_bo_sync_init(struct vk_device *vk_device, + struct vk_sync *vk_sync, + uint64_t initial_value) +{ + struct anv_device *device = container_of(vk_device, struct anv_device, vk); + struct anv_bo_sync *sync = to_anv_bo_sync(vk_sync); + + sync->state = initial_value ? 
ANV_BO_SYNC_STATE_SIGNALED : + ANV_BO_SYNC_STATE_RESET; + + return anv_device_alloc_bo(device, "bo-sync", 4096, + ANV_BO_ALLOC_EXTERNAL | + ANV_BO_ALLOC_IMPLICIT_SYNC, + 0 /* explicit_address */, + &sync->bo); +} + +static void +anv_bo_sync_finish(struct vk_device *vk_device, + struct vk_sync *vk_sync) +{ + struct anv_device *device = container_of(vk_device, struct anv_device, vk); + struct anv_bo_sync *sync = to_anv_bo_sync(vk_sync); + + anv_device_release_bo(device, sync->bo); +} + +static VkResult +anv_bo_sync_reset(struct vk_device *vk_device, + struct vk_sync *vk_sync) +{ + struct anv_bo_sync *sync = to_anv_bo_sync(vk_sync); + + sync->state = ANV_BO_SYNC_STATE_RESET; + + return VK_SUCCESS; +} + +static int64_t +anv_get_relative_timeout(uint64_t abs_timeout) +{ + uint64_t now = os_time_get_nano(); + + /* We don't want negative timeouts. + * + * DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is + * supposed to block indefinitely timeouts < 0. Unfortunately, + * this was broken for a couple of kernel releases. Since there's + * no way to know whether or not the kernel we're using is one of + * the broken ones, the best we can do is to clamp the timeout to + * INT64_MAX. This limits the maximum timeout from 584 years to + * 292 years - likely not a big deal. + */ + if (abs_timeout < now) + return 0; + + uint64_t rel_timeout = abs_timeout - now; + if (rel_timeout > (uint64_t) INT64_MAX) + rel_timeout = INT64_MAX; + + return rel_timeout; +} + +static VkResult +anv_bo_sync_wait(struct vk_device *vk_device, + uint32_t wait_count, + const struct vk_sync_wait *waits, + enum vk_sync_wait_flags wait_flags, + uint64_t abs_timeout_ns) +{ + struct anv_device *device = container_of(vk_device, struct anv_device, vk); + VkResult result; + + uint32_t pending = wait_count; + while (pending) { + pending = 0; + bool signaled = false; + for (uint32_t i = 0; i < wait_count; i++) { + struct anv_bo_sync *sync = to_anv_bo_sync(waits[i].sync); + switch (sync->state) { + case ANV_BO_SYNC_STATE_RESET: + /* This fence hasn't been submitted yet, we'll catch it the next + * time around. Yes, this may mean we dead-loop but, short of + * lots of locking and a condition variable, there's not much that + * we can do about that. + */ + assert(!(wait_flags & VK_SYNC_WAIT_PENDING)); + pending++; + continue; + + case ANV_BO_SYNC_STATE_SIGNALED: + /* This fence is not pending. If waitAll isn't set, we can return + * early. Otherwise, we have to keep going. + */ + if (wait_flags & VK_SYNC_WAIT_ANY) + return VK_SUCCESS; + continue; + + case ANV_BO_SYNC_STATE_SUBMITTED: + /* These are the fences we really care about. Go ahead and wait + * on it until we hit a timeout. + */ + if (!(wait_flags & VK_SYNC_WAIT_PENDING)) { + uint64_t rel_timeout = anv_get_relative_timeout(abs_timeout_ns); + result = anv_device_wait(device, sync->bo, rel_timeout); + /* This also covers VK_TIMEOUT */ + if (result != VK_SUCCESS) + return result; + + sync->state = ANV_BO_SYNC_STATE_SIGNALED; + signaled = true; + } + if (wait_flags & VK_SYNC_WAIT_ANY) + return VK_SUCCESS; + break; + + default: + unreachable("Invalid BO sync state"); + } + } + + if (pending && !signaled) { + /* If we've hit this then someone decided to vkWaitForFences before + * they've actually submitted any of them to a queue. This is a + * fairly pessimal case, so it's ok to lock here and use a standard + * pthreads condition variable. + */ + pthread_mutex_lock(&device->mutex); + + /* It's possible that some of the fences have changed state since the + * last time we checked. 
Now that we have the lock, check for + * pending fences again and don't wait if it's changed. + */ + uint32_t now_pending = 0; + for (uint32_t i = 0; i < wait_count; i++) { + struct anv_bo_sync *sync = to_anv_bo_sync(waits[i].sync); + if (sync->state == ANV_BO_SYNC_STATE_RESET) + now_pending++; + } + assert(now_pending <= pending); + + if (now_pending == pending) { + struct timespec abstime = { + .tv_sec = abs_timeout_ns / NSEC_PER_SEC, + .tv_nsec = abs_timeout_ns % NSEC_PER_SEC, + }; + + ASSERTED int ret; + ret = pthread_cond_timedwait(&device->queue_submit, + &device->mutex, &abstime); + assert(ret != EINVAL); + if (os_time_get_nano() >= abs_timeout_ns) { + pthread_mutex_unlock(&device->mutex); + return VK_TIMEOUT; + } + } + + pthread_mutex_unlock(&device->mutex); + } + } + + return VK_SUCCESS; +} + +const struct vk_sync_type anv_bo_sync_type = { + .size = sizeof(struct anv_bo_sync), + .features = VK_SYNC_FEATURE_BINARY | + VK_SYNC_FEATURE_GPU_WAIT | + VK_SYNC_FEATURE_GPU_MULTI_WAIT | + VK_SYNC_FEATURE_CPU_WAIT | + VK_SYNC_FEATURE_CPU_RESET | + VK_SYNC_FEATURE_WAIT_ANY | + VK_SYNC_FEATURE_WAIT_PENDING, + .init = anv_bo_sync_init, + .finish = anv_bo_sync_finish, + .reset = anv_bo_sync_reset, + .wait_many = anv_bo_sync_wait, +}; + +VkResult +anv_create_sync_for_memory(struct vk_device *device, + VkDeviceMemory memory, + bool signal_memory, + struct vk_sync **sync_out) +{ + ANV_FROM_HANDLE(anv_device_memory, mem, memory); + struct anv_bo_sync *bo_sync; + + bo_sync = vk_zalloc(&device->alloc, sizeof(*bo_sync), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (bo_sync == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + bo_sync->sync.type = &anv_bo_sync_type; + bo_sync->state = signal_memory ? ANV_BO_SYNC_STATE_RESET : + ANV_BO_SYNC_STATE_SUBMITTED; + bo_sync->bo = anv_bo_ref(mem->bo); + + *sync_out = &bo_sync->sync; + + return VK_SUCCESS; +} diff --git a/src/intel/vulkan_hasvk/anv_cmd_buffer.c b/src/intel/vulkan_hasvk/anv_cmd_buffer.c new file mode 100644 index 00000000000..0950bad52a6 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_cmd_buffer.c @@ -0,0 +1,1112 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "anv_private.h" +#include "anv_measure.h" + +#include "vk_util.h" + +/** \file anv_cmd_buffer.c + * + * This file contains all of the stuff for emitting commands into a command + * buffer. 
This includes implementations of most of the vkCmd* + * entrypoints. This file is concerned entirely with state emission and + * not with the command buffer data structure itself. As far as this file + * is concerned, most of anv_cmd_buffer is magic. + */ + +static void +anv_cmd_state_init(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_state *state = &cmd_buffer->state; + + memset(state, 0, sizeof(*state)); + + state->current_pipeline = UINT32_MAX; + state->gfx.restart_index = UINT32_MAX; + state->gfx.dirty = 0; +} + +static void +anv_cmd_pipeline_state_finish(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipe_state) +{ + for (uint32_t i = 0; i < ARRAY_SIZE(pipe_state->push_descriptors); i++) { + if (pipe_state->push_descriptors[i]) { + anv_descriptor_set_layout_unref(cmd_buffer->device, + pipe_state->push_descriptors[i]->set.layout); + vk_free(&cmd_buffer->vk.pool->alloc, pipe_state->push_descriptors[i]); + } + } +} + +static void +anv_cmd_state_finish(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_state *state = &cmd_buffer->state; + + anv_cmd_pipeline_state_finish(cmd_buffer, &state->gfx.base); + anv_cmd_pipeline_state_finish(cmd_buffer, &state->compute.base); +} + +static void +anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer) +{ + anv_cmd_state_finish(cmd_buffer); + anv_cmd_state_init(cmd_buffer); +} + +static void anv_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer); + +static const struct vk_command_buffer_ops cmd_buffer_ops = { + .destroy = anv_cmd_buffer_destroy, +}; + +static VkResult anv_create_cmd_buffer( + struct anv_device * device, + struct vk_command_pool * pool, + VkCommandBufferLevel level, + VkCommandBuffer* pCommandBuffer) +{ + struct anv_cmd_buffer *cmd_buffer; + VkResult result; + + cmd_buffer = vk_alloc(&pool->alloc, sizeof(*cmd_buffer), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (cmd_buffer == NULL) + return vk_error(pool, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = vk_command_buffer_init(pool, &cmd_buffer->vk, + &cmd_buffer_ops, level); + if (result != VK_SUCCESS) + goto fail_alloc; + + cmd_buffer->vk.dynamic_graphics_state.ms.sample_locations = + &cmd_buffer->state.gfx.sample_locations; + + cmd_buffer->batch.status = VK_SUCCESS; + + cmd_buffer->device = device; + + assert(pool->queue_family_index < device->physical->queue.family_count); + cmd_buffer->queue_family = + &device->physical->queue.families[pool->queue_family_index]; + + result = anv_cmd_buffer_init_batch_bo_chain(cmd_buffer); + if (result != VK_SUCCESS) + goto fail_vk; + + anv_state_stream_init(&cmd_buffer->surface_state_stream, + &device->surface_state_pool, 4096); + anv_state_stream_init(&cmd_buffer->dynamic_state_stream, + &device->dynamic_state_pool, 16384); + anv_state_stream_init(&cmd_buffer->general_state_stream, + &device->general_state_pool, 16384); + + cmd_buffer->self_mod_locations = NULL; + + anv_cmd_state_init(cmd_buffer); + + anv_measure_init(cmd_buffer); + + u_trace_init(&cmd_buffer->trace, &device->ds.trace_context); + + *pCommandBuffer = anv_cmd_buffer_to_handle(cmd_buffer); + + return VK_SUCCESS; + + fail_vk: + vk_command_buffer_finish(&cmd_buffer->vk); + fail_alloc: + vk_free2(&device->vk.alloc, &pool->alloc, cmd_buffer); + + return result; +} + +VkResult anv_AllocateCommandBuffers( + VkDevice _device, + const VkCommandBufferAllocateInfo* pAllocateInfo, + VkCommandBuffer* pCommandBuffers) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool); + + VkResult result = 
VK_SUCCESS; + uint32_t i; + + for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { + result = anv_create_cmd_buffer(device, pool, pAllocateInfo->level, + &pCommandBuffers[i]); + if (result != VK_SUCCESS) + break; + } + + if (result != VK_SUCCESS) { + while (i--) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]); + anv_cmd_buffer_destroy(cmd_buffer); + } + for (i = 0; i < pAllocateInfo->commandBufferCount; i++) + pCommandBuffers[i] = VK_NULL_HANDLE; + } + + return result; +} + +static void +anv_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer) +{ + struct anv_cmd_buffer *cmd_buffer = + container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk); + + u_trace_fini(&cmd_buffer->trace); + + anv_measure_destroy(cmd_buffer); + + anv_cmd_buffer_fini_batch_bo_chain(cmd_buffer); + + anv_state_stream_finish(&cmd_buffer->surface_state_stream); + anv_state_stream_finish(&cmd_buffer->dynamic_state_stream); + anv_state_stream_finish(&cmd_buffer->general_state_stream); + + anv_cmd_state_finish(cmd_buffer); + + vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->self_mod_locations); + + vk_command_buffer_finish(&cmd_buffer->vk); + vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer); +} + +VkResult +anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer) +{ + vk_command_buffer_reset(&cmd_buffer->vk); + + cmd_buffer->usage_flags = 0; + cmd_buffer->perf_query_pool = NULL; + anv_cmd_buffer_reset_batch_bo_chain(cmd_buffer); + anv_cmd_state_reset(cmd_buffer); + + anv_state_stream_finish(&cmd_buffer->surface_state_stream); + anv_state_stream_init(&cmd_buffer->surface_state_stream, + &cmd_buffer->device->surface_state_pool, 4096); + + anv_state_stream_finish(&cmd_buffer->dynamic_state_stream); + anv_state_stream_init(&cmd_buffer->dynamic_state_stream, + &cmd_buffer->device->dynamic_state_pool, 16384); + + anv_state_stream_finish(&cmd_buffer->general_state_stream); + anv_state_stream_init(&cmd_buffer->general_state_stream, + &cmd_buffer->device->general_state_pool, 16384); + + anv_measure_reset(cmd_buffer); + + u_trace_fini(&cmd_buffer->trace); + u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->ds.trace_context); + + return VK_SUCCESS; +} + +VkResult anv_ResetCommandBuffer( + VkCommandBuffer commandBuffer, + VkCommandBufferResetFlags flags) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + return anv_cmd_buffer_reset(cmd_buffer); +} + +void +anv_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer) +{ + const struct intel_device_info *devinfo = cmd_buffer->device->info; + anv_genX(devinfo, cmd_buffer_emit_state_base_address)(cmd_buffer); +} + +void +anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + uint32_t level, + uint32_t base_layer, + uint32_t layer_count) +{ + const struct intel_device_info *devinfo = cmd_buffer->device->info; + anv_genX(devinfo, cmd_buffer_mark_image_written)(cmd_buffer, image, + aspect, aux_usage, + level, base_layer, + layer_count); +} + +void +anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer) +{ + const struct intel_device_info *devinfo = cmd_buffer->device->info; + anv_genX(devinfo, cmd_emit_conditional_render_predicate)(cmd_buffer); +} + +static bool +mem_update(void *dst, const void *src, size_t size) +{ + if (memcmp(dst, src, size) == 0) + return false; + + memcpy(dst, src, size); + return true; +} + +static void +set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer, + 
gl_shader_stage stage, + const struct anv_pipeline_bind_map *map) +{ + assert(stage < ARRAY_SIZE(cmd_buffer->state.surface_sha1s)); + if (mem_update(cmd_buffer->state.surface_sha1s[stage], + map->surface_sha1, sizeof(map->surface_sha1))) + cmd_buffer->state.descriptors_dirty |= mesa_to_vk_shader_stage(stage); + + assert(stage < ARRAY_SIZE(cmd_buffer->state.sampler_sha1s)); + if (mem_update(cmd_buffer->state.sampler_sha1s[stage], + map->sampler_sha1, sizeof(map->sampler_sha1))) + cmd_buffer->state.descriptors_dirty |= mesa_to_vk_shader_stage(stage); + + assert(stage < ARRAY_SIZE(cmd_buffer->state.push_sha1s)); + if (mem_update(cmd_buffer->state.push_sha1s[stage], + map->push_sha1, sizeof(map->push_sha1))) + cmd_buffer->state.push_constants_dirty |= mesa_to_vk_shader_stage(stage); +} + +static inline uint32_t +ilog2_round_up(uint32_t value) +{ + assert(value != 0); + return 32 - __builtin_clz(value - 1); +} + +static void +anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipeline_state, + struct anv_pipeline *pipeline, + VkShaderStageFlags stages) +{ + struct anv_device *device = cmd_buffer->device; + + uint64_t ray_shadow_size = + align_u64(brw_rt_ray_queries_shadow_stacks_size(device->info, + pipeline->ray_queries), + 4096); + if (ray_shadow_size > 0 && + (!cmd_buffer->state.ray_query_shadow_bo || + cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) { + unsigned shadow_size_log2 = MAX2(ilog2_round_up(ray_shadow_size), 16); + unsigned bucket = shadow_size_log2 - 16; + assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos)); + + struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[bucket]); + if (bo == NULL) { + struct anv_bo *new_bo; + VkResult result = anv_device_alloc_bo(device, "RT queries shadow", + ray_shadow_size, + 0, /* alloc_flags */ + 0, /* explicit_address */ + &new_bo); + if (result != VK_SUCCESS) { + anv_batch_set_error(&cmd_buffer->batch, result); + return; + } + + bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[bucket], NULL, new_bo); + if (bo != NULL) { + anv_device_release_bo(device, bo); + } else { + bo = new_bo; + } + } + cmd_buffer->state.ray_query_shadow_bo = bo; + + /* Add the ray query buffers to the batch list. */ + anv_reloc_list_add_bo(cmd_buffer->batch.relocs, + cmd_buffer->batch.alloc, + cmd_buffer->state.ray_query_shadow_bo); + } + + /* Add the HW buffer to the list of BO used. */ + anv_reloc_list_add_bo(cmd_buffer->batch.relocs, + cmd_buffer->batch.alloc, + device->ray_query_bo); + + /* Fill the push constants & mark them dirty. 
*/ + struct anv_state ray_query_global_state = + anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer); + + struct anv_address ray_query_globals_addr = (struct anv_address) { + .bo = device->dynamic_state_pool.block_pool.bo, + .offset = ray_query_global_state.offset, + }; + pipeline_state->push_constants.ray_query_globals = + anv_address_physical(ray_query_globals_addr); + cmd_buffer->state.push_constants_dirty |= stages; +} + +void anv_CmdBindPipeline( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipeline _pipeline) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline); + struct anv_cmd_pipeline_state *state; + VkShaderStageFlags stages = 0; + + switch (pipelineBindPoint) { + case VK_PIPELINE_BIND_POINT_COMPUTE: { + struct anv_compute_pipeline *compute_pipeline = + anv_pipeline_to_compute(pipeline); + if (cmd_buffer->state.compute.pipeline == compute_pipeline) + return; + + cmd_buffer->state.compute.pipeline = compute_pipeline; + cmd_buffer->state.compute.pipeline_dirty = true; + set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE, + &compute_pipeline->cs->bind_map); + + state = &cmd_buffer->state.compute.base; + stages = VK_SHADER_STAGE_COMPUTE_BIT; + break; + } + + case VK_PIPELINE_BIND_POINT_GRAPHICS: { + struct anv_graphics_pipeline *gfx_pipeline = + anv_pipeline_to_graphics(pipeline); + if (cmd_buffer->state.gfx.pipeline == gfx_pipeline) + return; + + cmd_buffer->state.gfx.pipeline = gfx_pipeline; + cmd_buffer->state.gfx.vb_dirty |= gfx_pipeline->vb_used; + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; + + anv_foreach_stage(stage, gfx_pipeline->active_stages) { + set_dirty_for_bind_map(cmd_buffer, stage, + &gfx_pipeline->shaders[stage]->bind_map); + } + + /* Apply the non dynamic state from the pipeline */ + vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk, + &gfx_pipeline->dynamic_state); + + state = &cmd_buffer->state.gfx.base; + stages = gfx_pipeline->active_stages; + break; + } + + case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: { + struct anv_ray_tracing_pipeline *rt_pipeline = + anv_pipeline_to_ray_tracing(pipeline); + if (cmd_buffer->state.rt.pipeline == rt_pipeline) + return; + + cmd_buffer->state.rt.pipeline = rt_pipeline; + cmd_buffer->state.rt.pipeline_dirty = true; + + if (rt_pipeline->stack_size > 0) { + anv_CmdSetRayTracingPipelineStackSizeKHR(commandBuffer, + rt_pipeline->stack_size); + } + + state = &cmd_buffer->state.rt.base; + break; + } + + default: + unreachable("invalid bind point"); + break; + } + + if (pipeline->ray_queries > 0) + anv_cmd_buffer_set_ray_query_buffer(cmd_buffer, state, pipeline, stages); +} + +static void +anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer, + VkPipelineBindPoint bind_point, + struct anv_pipeline_layout *layout, + uint32_t set_index, + struct anv_descriptor_set *set, + uint32_t *dynamic_offset_count, + const uint32_t **dynamic_offsets) +{ + /* Either we have no pool because it's a push descriptor or the pool is not + * host only : + * + * VUID-vkCmdBindDescriptorSets-pDescriptorSets-04616: + * + * "Each element of pDescriptorSets must not have been allocated from a + * VkDescriptorPool with the + * VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_VALVE flag set" + */ + assert(!set->pool || !set->pool->host_only); + + struct anv_descriptor_set_layout *set_layout = + layout->set[set_index].layout; + + VkShaderStageFlags stages = set_layout->shader_stages; + struct anv_cmd_pipeline_state *pipe_state; + + 
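/* Narrow the layout's shader stages to the ones reachable from this bind point and pick the matching per-bind-point pipeline state. */ +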
switch (bind_point) { + case VK_PIPELINE_BIND_POINT_GRAPHICS: + stages &= VK_SHADER_STAGE_ALL_GRAPHICS | + (cmd_buffer->device->vk.enabled_extensions.NV_mesh_shader ? + (VK_SHADER_STAGE_TASK_BIT_NV | + VK_SHADER_STAGE_MESH_BIT_NV) : 0); + pipe_state = &cmd_buffer->state.gfx.base; + break; + + case VK_PIPELINE_BIND_POINT_COMPUTE: + stages &= VK_SHADER_STAGE_COMPUTE_BIT; + pipe_state = &cmd_buffer->state.compute.base; + break; + + case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: + stages &= VK_SHADER_STAGE_RAYGEN_BIT_KHR | + VK_SHADER_STAGE_ANY_HIT_BIT_KHR | + VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | + VK_SHADER_STAGE_MISS_BIT_KHR | + VK_SHADER_STAGE_INTERSECTION_BIT_KHR | + VK_SHADER_STAGE_CALLABLE_BIT_KHR; + pipe_state = &cmd_buffer->state.rt.base; + break; + + default: + unreachable("invalid bind point"); + } + + VkShaderStageFlags dirty_stages = 0; + /* If it's a push descriptor set, we have to flag things as dirty + * regardless of whether or not the CPU-side data structure changed as we + * may have edited in-place. + */ + if (pipe_state->descriptors[set_index] != set || + anv_descriptor_set_is_push(set)) { + pipe_state->descriptors[set_index] = set; + + /* Those stages don't have access to HW binding tables. + * This means that we have to upload the descriptor set + * as an 64-bit address in the push constants. + */ + bool update_desc_sets = stages & (VK_SHADER_STAGE_TASK_BIT_NV | + VK_SHADER_STAGE_MESH_BIT_NV | + VK_SHADER_STAGE_RAYGEN_BIT_KHR | + VK_SHADER_STAGE_ANY_HIT_BIT_KHR | + VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | + VK_SHADER_STAGE_MISS_BIT_KHR | + VK_SHADER_STAGE_INTERSECTION_BIT_KHR | + VK_SHADER_STAGE_CALLABLE_BIT_KHR); + + if (update_desc_sets) { + struct anv_push_constants *push = &pipe_state->push_constants; + + struct anv_address addr = anv_descriptor_set_address(set); + push->desc_sets[set_index] = anv_address_physical(addr); + + if (addr.bo) { + anv_reloc_list_add_bo(cmd_buffer->batch.relocs, + cmd_buffer->batch.alloc, + addr.bo); + } + } + + dirty_stages |= stages; + } + + if (dynamic_offsets) { + if (set_layout->dynamic_offset_count > 0) { + struct anv_push_constants *push = &pipe_state->push_constants; + uint32_t dynamic_offset_start = + layout->set[set_index].dynamic_offset_start; + uint32_t *push_offsets = + &push->dynamic_offsets[dynamic_offset_start]; + + /* Assert that everything is in range */ + assert(set_layout->dynamic_offset_count <= *dynamic_offset_count); + assert(dynamic_offset_start + set_layout->dynamic_offset_count <= + ARRAY_SIZE(push->dynamic_offsets)); + + for (uint32_t i = 0; i < set_layout->dynamic_offset_count; i++) { + if (push_offsets[i] != (*dynamic_offsets)[i]) { + push_offsets[i] = (*dynamic_offsets)[i]; + /* dynamic_offset_stages[] elements could contain blanket + * values like VK_SHADER_STAGE_ALL, so limit this to the + * binding point's bits. 
+ */ + dirty_stages |= set_layout->dynamic_offset_stages[i] & stages; + } + } + + *dynamic_offsets += set_layout->dynamic_offset_count; + *dynamic_offset_count -= set_layout->dynamic_offset_count; + } + } + + cmd_buffer->state.descriptors_dirty |= dirty_stages; + cmd_buffer->state.push_constants_dirty |= dirty_stages; +} + +void anv_CmdBindDescriptorSets( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t firstSet, + uint32_t descriptorSetCount, + const VkDescriptorSet* pDescriptorSets, + uint32_t dynamicOffsetCount, + const uint32_t* pDynamicOffsets) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout); + + assert(firstSet + descriptorSetCount <= MAX_SETS); + + for (uint32_t i = 0; i < descriptorSetCount; i++) { + ANV_FROM_HANDLE(anv_descriptor_set, set, pDescriptorSets[i]); + anv_cmd_buffer_bind_descriptor_set(cmd_buffer, pipelineBindPoint, + layout, firstSet + i, set, + &dynamicOffsetCount, + &pDynamicOffsets); + } +} + +void anv_CmdBindVertexBuffers2( + VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer* pBuffers, + const VkDeviceSize* pOffsets, + const VkDeviceSize* pSizes, + const VkDeviceSize* pStrides) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_vertex_binding *vb = cmd_buffer->state.vertex_bindings; + + /* We have to defer setting up vertex buffer since we need the buffer + * stride from the pipeline. */ + + assert(firstBinding + bindingCount <= MAX_VBS); + for (uint32_t i = 0; i < bindingCount; i++) { + ANV_FROM_HANDLE(anv_buffer, buffer, pBuffers[i]); + + if (buffer == NULL) { + vb[firstBinding + i] = (struct anv_vertex_binding) { + .buffer = NULL, + }; + } else { + vb[firstBinding + i] = (struct anv_vertex_binding) { + .buffer = buffer, + .offset = pOffsets[i], + .size = vk_buffer_range(&buffer->vk, pOffsets[i], + pSizes ? pSizes[i] : VK_WHOLE_SIZE), + }; + } + cmd_buffer->state.gfx.vb_dirty |= 1 << (firstBinding + i); + } + + if (pStrides != NULL) { + vk_cmd_set_vertex_binding_strides(&cmd_buffer->vk, firstBinding, + bindingCount, pStrides); + } +} + +void anv_CmdBindTransformFeedbackBuffersEXT( + VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer* pBuffers, + const VkDeviceSize* pOffsets, + const VkDeviceSize* pSizes) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_xfb_binding *xfb = cmd_buffer->state.xfb_bindings; + + /* We have to defer setting up vertex buffer since we need the buffer + * stride from the pipeline. */ + + assert(firstBinding + bindingCount <= MAX_XFB_BUFFERS); + for (uint32_t i = 0; i < bindingCount; i++) { + if (pBuffers[i] == VK_NULL_HANDLE) { + xfb[firstBinding + i].buffer = NULL; + } else { + ANV_FROM_HANDLE(anv_buffer, buffer, pBuffers[i]); + xfb[firstBinding + i].buffer = buffer; + xfb[firstBinding + i].offset = pOffsets[i]; + xfb[firstBinding + i].size = + vk_buffer_range(&buffer->vk, pOffsets[i], + pSizes ? pSizes[i] : VK_WHOLE_SIZE); + } + } +} + +enum isl_format +anv_isl_format_for_descriptor_type(const struct anv_device *device, + VkDescriptorType type) +{ + switch (type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + return device->physical->compiler->indirect_ubos_use_sampler ? 
+ ISL_FORMAT_R32G32B32A32_FLOAT : ISL_FORMAT_RAW; + + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + return ISL_FORMAT_RAW; + + default: + unreachable("Invalid descriptor type"); + } +} + +struct anv_state +anv_cmd_buffer_emit_dynamic(struct anv_cmd_buffer *cmd_buffer, + const void *data, uint32_t size, uint32_t alignment) +{ + struct anv_state state; + + state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, alignment); + memcpy(state.map, data, size); + + VG(VALGRIND_CHECK_MEM_IS_DEFINED(state.map, size)); + + return state; +} + +struct anv_state +anv_cmd_buffer_merge_dynamic(struct anv_cmd_buffer *cmd_buffer, + uint32_t *a, uint32_t *b, + uint32_t dwords, uint32_t alignment) +{ + struct anv_state state; + uint32_t *p; + + state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + dwords * 4, alignment); + p = state.map; + for (uint32_t i = 0; i < dwords; i++) + p[i] = a[i] | b[i]; + + VG(VALGRIND_CHECK_MEM_IS_DEFINED(p, dwords * 4)); + + return state; +} + +struct anv_state +anv_cmd_buffer_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_push_constants *data = + &cmd_buffer->state.gfx.base.push_constants; + + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + sizeof(struct anv_push_constants), + 32 /* bottom 5 bits MBZ */); + memcpy(state.map, data, sizeof(struct anv_push_constants)); + + return state; +} + +struct anv_state +anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) +{ + const struct intel_device_info *devinfo = cmd_buffer->device->info; + struct anv_push_constants *data = + &cmd_buffer->state.compute.base.push_constants; + struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline; + const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); + const struct anv_push_range *range = &pipeline->cs->bind_map.push_ranges[0]; + + const struct brw_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL); + const unsigned total_push_constants_size = + brw_cs_push_const_total_size(cs_prog_data, dispatch.threads); + if (total_push_constants_size == 0) + return (struct anv_state) { .offset = 0 }; + + const unsigned push_constant_alignment = + cmd_buffer->device->info->ver < 8 ? 
32 : 64; + const unsigned aligned_total_push_constants_size = + ALIGN(total_push_constants_size, push_constant_alignment); + struct anv_state state; + if (devinfo->verx10 >= 125) { + state = anv_state_stream_alloc(&cmd_buffer->general_state_stream, + aligned_total_push_constants_size, + push_constant_alignment); + } else { + state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + aligned_total_push_constants_size, + push_constant_alignment); + } + + void *dst = state.map; + const void *src = (char *)data + (range->start * 32); + + if (cs_prog_data->push.cross_thread.size > 0) { + memcpy(dst, src, cs_prog_data->push.cross_thread.size); + dst += cs_prog_data->push.cross_thread.size; + src += cs_prog_data->push.cross_thread.size; + } + + if (cs_prog_data->push.per_thread.size > 0) { + for (unsigned t = 0; t < dispatch.threads; t++) { + memcpy(dst, src, cs_prog_data->push.per_thread.size); + + uint32_t *subgroup_id = dst + + offsetof(struct anv_push_constants, cs.subgroup_id) - + (range->start * 32 + cs_prog_data->push.cross_thread.size); + *subgroup_id = t; + + dst += cs_prog_data->push.per_thread.size; + } + } + + return state; +} + +void anv_CmdPushConstants( + VkCommandBuffer commandBuffer, + VkPipelineLayout layout, + VkShaderStageFlags stageFlags, + uint32_t offset, + uint32_t size, + const void* pValues) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + if (stageFlags & (VK_SHADER_STAGE_ALL_GRAPHICS | + VK_SHADER_STAGE_TASK_BIT_NV | + VK_SHADER_STAGE_MESH_BIT_NV)) { + struct anv_cmd_pipeline_state *pipe_state = + &cmd_buffer->state.gfx.base; + + memcpy(pipe_state->push_constants.client_data + offset, pValues, size); + } + if (stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) { + struct anv_cmd_pipeline_state *pipe_state = + &cmd_buffer->state.compute.base; + + memcpy(pipe_state->push_constants.client_data + offset, pValues, size); + } + if (stageFlags & (VK_SHADER_STAGE_RAYGEN_BIT_KHR | + VK_SHADER_STAGE_ANY_HIT_BIT_KHR | + VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | + VK_SHADER_STAGE_MISS_BIT_KHR | + VK_SHADER_STAGE_INTERSECTION_BIT_KHR | + VK_SHADER_STAGE_CALLABLE_BIT_KHR)) { + struct anv_cmd_pipeline_state *pipe_state = + &cmd_buffer->state.rt.base; + + memcpy(pipe_state->push_constants.client_data + offset, pValues, size); + } + + cmd_buffer->state.push_constants_dirty |= stageFlags; +} + +static struct anv_descriptor_set * +anv_cmd_buffer_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer, + VkPipelineBindPoint bind_point, + struct anv_descriptor_set_layout *layout, + uint32_t _set) +{ + struct anv_cmd_pipeline_state *pipe_state; + + switch (bind_point) { + case VK_PIPELINE_BIND_POINT_GRAPHICS: + pipe_state = &cmd_buffer->state.gfx.base; + break; + + case VK_PIPELINE_BIND_POINT_COMPUTE: + pipe_state = &cmd_buffer->state.compute.base; + break; + + case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: + pipe_state = &cmd_buffer->state.rt.base; + break; + + default: + unreachable("invalid bind point"); + } + + struct anv_push_descriptor_set **push_set = + &pipe_state->push_descriptors[_set]; + + if (*push_set == NULL) { + *push_set = vk_zalloc(&cmd_buffer->vk.pool->alloc, + sizeof(struct anv_push_descriptor_set), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (*push_set == NULL) { + anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); + return NULL; + } + } + + struct anv_descriptor_set *set = &(*push_set)->set; + + if (set->layout != layout) { + if (set->layout) + anv_descriptor_set_layout_unref(cmd_buffer->device, set->layout); + 
anv_descriptor_set_layout_ref(layout); + set->layout = layout; + } + set->size = anv_descriptor_set_layout_size(layout, 0); + set->buffer_view_count = layout->buffer_view_count; + set->descriptor_count = layout->descriptor_count; + set->buffer_views = (*push_set)->buffer_views; + + if (layout->descriptor_buffer_size && + ((*push_set)->set_used_on_gpu || + set->desc_mem.alloc_size < layout->descriptor_buffer_size)) { + /* The previous buffer is either actively used by some GPU command (so + * we can't modify it) or is too small. Allocate a new one. + */ + struct anv_state desc_mem = + anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream, + anv_descriptor_set_layout_descriptor_buffer_size(layout, 0), + ANV_UBO_ALIGNMENT); + if (set->desc_mem.alloc_size) { + /* TODO: Do we really need to copy all the time? */ + memcpy(desc_mem.map, set->desc_mem.map, + MIN2(desc_mem.alloc_size, set->desc_mem.alloc_size)); + } + set->desc_mem = desc_mem; + + set->desc_addr = (struct anv_address) { + .bo = cmd_buffer->dynamic_state_stream.state_pool->block_pool.bo, + .offset = set->desc_mem.offset, + }; + + enum isl_format format = + anv_isl_format_for_descriptor_type(cmd_buffer->device, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); + + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + set->desc_surface_state = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, + isl_dev->ss.size, isl_dev->ss.align); + anv_fill_buffer_surface_state(cmd_buffer->device, + set->desc_surface_state, + format, ISL_SWIZZLE_IDENTITY, + ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, + set->desc_addr, + layout->descriptor_buffer_size, 1); + } + + return set; +} + +void anv_CmdPushDescriptorSetKHR( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t _set, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout); + + assert(_set < MAX_SETS); + + struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout; + + struct anv_descriptor_set *set = + anv_cmd_buffer_push_descriptor_set(cmd_buffer, pipelineBindPoint, + set_layout, _set); + if (!set) + return; + + /* Go through the user supplied descriptors. 
*/ + for (uint32_t i = 0; i < descriptorWriteCount; i++) { + const VkWriteDescriptorSet *write = &pDescriptorWrites[i]; + + switch (write->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + anv_descriptor_set_write_image_view(cmd_buffer->device, set, + write->pImageInfo + j, + write->descriptorType, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + ANV_FROM_HANDLE(anv_buffer_view, bview, + write->pTexelBufferView[j]); + + anv_descriptor_set_write_buffer_view(cmd_buffer->device, set, + write->descriptorType, + bview, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + ANV_FROM_HANDLE(anv_buffer, buffer, write->pBufferInfo[j].buffer); + + anv_descriptor_set_write_buffer(cmd_buffer->device, set, + &cmd_buffer->surface_state_stream, + write->descriptorType, + buffer, + write->dstBinding, + write->dstArrayElement + j, + write->pBufferInfo[j].offset, + write->pBufferInfo[j].range); + } + break; + + case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: { + const VkWriteDescriptorSetAccelerationStructureKHR *accel_write = + vk_find_struct_const(write, WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR); + assert(accel_write->accelerationStructureCount == + write->descriptorCount); + for (uint32_t j = 0; j < write->descriptorCount; j++) { + ANV_FROM_HANDLE(anv_acceleration_structure, accel, + accel_write->pAccelerationStructures[j]); + anv_descriptor_set_write_acceleration_structure(cmd_buffer->device, + set, accel, + write->dstBinding, + write->dstArrayElement + j); + } + break; + } + + default: + break; + } + } + + anv_cmd_buffer_bind_descriptor_set(cmd_buffer, pipelineBindPoint, + layout, _set, set, NULL, NULL); +} + +void anv_CmdPushDescriptorSetWithTemplateKHR( + VkCommandBuffer commandBuffer, + VkDescriptorUpdateTemplate descriptorUpdateTemplate, + VkPipelineLayout _layout, + uint32_t _set, + const void* pData) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_descriptor_update_template, template, + descriptorUpdateTemplate); + ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout); + + assert(_set < MAX_PUSH_DESCRIPTORS); + + struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout; + + struct anv_descriptor_set *set = + anv_cmd_buffer_push_descriptor_set(cmd_buffer, template->bind_point, + set_layout, _set); + if (!set) + return; + + anv_descriptor_set_write_template(cmd_buffer->device, set, + &cmd_buffer->surface_state_stream, + template, + pData); + + anv_cmd_buffer_bind_descriptor_set(cmd_buffer, template->bind_point, + layout, _set, set, NULL, NULL); +} + +void anv_CmdSetDeviceMask( + VkCommandBuffer commandBuffer, + uint32_t deviceMask) +{ + /* No-op */ +} + +void anv_CmdSetRayTracingPipelineStackSizeKHR( + VkCommandBuffer commandBuffer, + uint32_t pipelineStackSize) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_cmd_ray_tracing_state *rt = 
&cmd_buffer->state.rt; + struct anv_device *device = cmd_buffer->device; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + uint32_t stack_ids_per_dss = 2048; /* TODO */ + + unsigned stack_size_log2 = ilog2_round_up(pipelineStackSize); + if (stack_size_log2 < 10) + stack_size_log2 = 10; + + if (rt->scratch.layout.total_size == 1 << stack_size_log2) + return; + + brw_rt_compute_scratch_layout(&rt->scratch.layout, device->info, + stack_ids_per_dss, 1 << stack_size_log2); + + unsigned bucket = stack_size_log2 - 10; + assert(bucket < ARRAY_SIZE(device->rt_scratch_bos)); + + struct anv_bo *bo = p_atomic_read(&device->rt_scratch_bos[bucket]); + if (bo == NULL) { + struct anv_bo *new_bo; + VkResult result = anv_device_alloc_bo(device, "RT scratch", + rt->scratch.layout.total_size, + 0, /* alloc_flags */ + 0, /* explicit_address */ + &new_bo); + if (result != VK_SUCCESS) { + rt->scratch.layout.total_size = 0; + anv_batch_set_error(&cmd_buffer->batch, result); + return; + } + + bo = p_atomic_cmpxchg(&device->rt_scratch_bos[bucket], NULL, new_bo); + if (bo != NULL) { + anv_device_release_bo(device, bo); + } else { + bo = new_bo; + } + } + + rt->scratch.bo = bo; +} diff --git a/src/intel/vulkan_hasvk/anv_descriptor_set.c b/src/intel/vulkan_hasvk/anv_descriptor_set.c new file mode 100644 index 00000000000..c8fe93a9fbd --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_descriptor_set.c @@ -0,0 +1,2046 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "util/mesa-sha1.h" +#include "vk_util.h" + +#include "anv_private.h" + +/* + * Descriptor set layouts. 
+ */ + +static enum anv_descriptor_data +anv_descriptor_data_for_type(const struct anv_physical_device *device, + VkDescriptorType type) +{ + enum anv_descriptor_data data = 0; + + switch (type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + data = ANV_DESCRIPTOR_SAMPLER_STATE; + if (device->has_bindless_samplers) + data |= ANV_DESCRIPTOR_SAMPLED_IMAGE; + break; + + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + data = ANV_DESCRIPTOR_SURFACE_STATE | + ANV_DESCRIPTOR_SAMPLER_STATE; + if (device->has_bindless_images || device->has_bindless_samplers) + data |= ANV_DESCRIPTOR_SAMPLED_IMAGE; + break; + + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + data = ANV_DESCRIPTOR_SURFACE_STATE; + if (device->has_bindless_images) + data |= ANV_DESCRIPTOR_SAMPLED_IMAGE; + break; + + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + data = ANV_DESCRIPTOR_SURFACE_STATE; + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + data = ANV_DESCRIPTOR_SURFACE_STATE; + if (device->info.ver < 9) + data |= ANV_DESCRIPTOR_IMAGE_PARAM; + if (device->has_bindless_images) + data |= ANV_DESCRIPTOR_STORAGE_IMAGE; + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + data = ANV_DESCRIPTOR_SURFACE_STATE | + ANV_DESCRIPTOR_BUFFER_VIEW; + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + data = ANV_DESCRIPTOR_SURFACE_STATE; + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + data = ANV_DESCRIPTOR_INLINE_UNIFORM; + break; + + case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: + data = ANV_DESCRIPTOR_ADDRESS_RANGE; + break; + + default: + unreachable("Unsupported descriptor type"); + } + + /* On gfx8 and above when we have softpin enabled, we also need to push + * SSBO address ranges so that we can use A64 messages in the shader. + */ + if (device->has_a64_buffer_access && + (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER || + type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC || + type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || + type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)) + data |= ANV_DESCRIPTOR_ADDRESS_RANGE; + + /* On Ivy Bridge and Bay Trail, we need swizzles textures in the shader + * Do not handle VK_DESCRIPTOR_TYPE_STORAGE_IMAGE and + * VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT because they already must + * have identity swizzle. + * + * TODO: We need to handle swizzle on buffer views too for those same + * platforms. 
+ */ + if (device->info.verx10 == 70 && + (type == VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE || + type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)) + data |= ANV_DESCRIPTOR_TEXTURE_SWIZZLE; + + return data; +} + +static enum anv_descriptor_data +anv_descriptor_data_for_mutable_type(const struct anv_physical_device *device, + const VkMutableDescriptorTypeCreateInfoVALVE *mutable_info, + int binding) +{ + enum anv_descriptor_data desc_data = 0; + + if (!mutable_info || mutable_info->mutableDescriptorTypeListCount == 0) { + for(VkDescriptorType i = 0; i <= VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; i++) { + if (i == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC || + i == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || + i == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) + continue; + + desc_data |= anv_descriptor_data_for_type(device, i); + } + + desc_data |= anv_descriptor_data_for_type( + device, VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR); + + return desc_data; + } + + const VkMutableDescriptorTypeListVALVE *type_list = + &mutable_info->pMutableDescriptorTypeLists[binding]; + for (uint32_t i = 0; i < type_list->descriptorTypeCount; i++) { + desc_data |= + anv_descriptor_data_for_type(device, type_list->pDescriptorTypes[i]); + } + + return desc_data; +} + +static unsigned +anv_descriptor_data_size(enum anv_descriptor_data data) +{ + unsigned size = 0; + + if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE) + size += sizeof(struct anv_sampled_image_descriptor); + + if (data & ANV_DESCRIPTOR_STORAGE_IMAGE) + size += sizeof(struct anv_storage_image_descriptor); + + if (data & ANV_DESCRIPTOR_IMAGE_PARAM) + size += BRW_IMAGE_PARAM_SIZE * 4; + + if (data & ANV_DESCRIPTOR_ADDRESS_RANGE) + size += sizeof(struct anv_address_range_descriptor); + + if (data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE) + size += sizeof(struct anv_texture_swizzle_descriptor); + + return size; +} + +static bool +anv_needs_descriptor_buffer(VkDescriptorType desc_type, + enum anv_descriptor_data desc_data) +{ + if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK || + anv_descriptor_data_size(desc_data) > 0) + return true; + return false; +} + +/** Returns the size in bytes of each descriptor with the given layout */ +static unsigned +anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout) +{ + if (layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) { + assert(layout->data == ANV_DESCRIPTOR_INLINE_UNIFORM); + return layout->array_size; + } + + unsigned size = anv_descriptor_data_size(layout->data); + + /* For multi-planar bindings, we make every descriptor consume the maximum + * number of planes so we don't have to bother with walking arrays and + * adding things up every time. Fortunately, YCbCr samplers aren't all + * that common and likely won't be in the middle of big arrays. 
+ */ + if (layout->max_plane_count > 1) + size *= layout->max_plane_count; + + return size; +} + +/** Returns size in bytes of the biggest descriptor in the given layout */ +static unsigned +anv_descriptor_size_for_mutable_type(const struct anv_physical_device *device, + const VkMutableDescriptorTypeCreateInfoVALVE *mutable_info, + int binding) +{ + unsigned size = 0; + + if (!mutable_info || mutable_info->mutableDescriptorTypeListCount == 0) { + for(VkDescriptorType i = 0; i <= VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; i++) { + + if (i == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC || + i == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || + i == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) + continue; + + enum anv_descriptor_data desc_data = + anv_descriptor_data_for_type(device, i); + size = MAX2(size, anv_descriptor_data_size(desc_data)); + } + + enum anv_descriptor_data desc_data = anv_descriptor_data_for_type( + device, VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR); + size = MAX2(size, anv_descriptor_data_size(desc_data)); + + return size; + } + + const VkMutableDescriptorTypeListVALVE *type_list = + &mutable_info->pMutableDescriptorTypeLists[binding]; + for (uint32_t i = 0; i < type_list->descriptorTypeCount; i++) { + enum anv_descriptor_data desc_data = + anv_descriptor_data_for_type(device, type_list->pDescriptorTypes[i]); + size = MAX2(size, anv_descriptor_data_size(desc_data)); + } + + return size; +} + +static bool +anv_descriptor_data_supports_bindless(const struct anv_physical_device *pdevice, + enum anv_descriptor_data data, + bool sampler) +{ + if (data & ANV_DESCRIPTOR_ADDRESS_RANGE) { + assert(pdevice->has_a64_buffer_access); + return true; + } + + if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE) { + assert(pdevice->has_bindless_images || pdevice->has_bindless_samplers); + return sampler ? 
pdevice->has_bindless_samplers : + pdevice->has_bindless_images; + } + + if (data & ANV_DESCRIPTOR_STORAGE_IMAGE) { + assert(pdevice->has_bindless_images); + return true; + } + + return false; +} + +bool +anv_descriptor_supports_bindless(const struct anv_physical_device *pdevice, + const struct anv_descriptor_set_binding_layout *binding, + bool sampler) +{ + return anv_descriptor_data_supports_bindless(pdevice, binding->data, + sampler); +} + +bool +anv_descriptor_requires_bindless(const struct anv_physical_device *pdevice, + const struct anv_descriptor_set_binding_layout *binding, + bool sampler) +{ + if (pdevice->always_use_bindless) + return anv_descriptor_supports_bindless(pdevice, binding, sampler); + + static const VkDescriptorBindingFlagBits flags_requiring_bindless = + VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT | + VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT | + VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT; + + return (binding->flags & flags_requiring_bindless) != 0; +} + +void anv_GetDescriptorSetLayoutSupport( + VkDevice _device, + const VkDescriptorSetLayoutCreateInfo* pCreateInfo, + VkDescriptorSetLayoutSupport* pSupport) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + const struct anv_physical_device *pdevice = device->physical; + + uint32_t surface_count[MESA_VULKAN_SHADER_STAGES] = { 0, }; + VkDescriptorType varying_desc_type = VK_DESCRIPTOR_TYPE_MAX_ENUM; + bool needs_descriptor_buffer = false; + + const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags_info = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); + const VkMutableDescriptorTypeCreateInfoVALVE *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_VALVE); + + for (uint32_t b = 0; b < pCreateInfo->bindingCount; b++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[b]; + + VkDescriptorBindingFlags flags = 0; + if (binding_flags_info && binding_flags_info->bindingCount > 0) { + assert(binding_flags_info->bindingCount == pCreateInfo->bindingCount); + flags = binding_flags_info->pBindingFlags[b]; + } + + enum anv_descriptor_data desc_data = + binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? 
+ anv_descriptor_data_for_mutable_type(pdevice, mutable_info, b) : + anv_descriptor_data_for_type(pdevice, binding->descriptorType); + + if (anv_needs_descriptor_buffer(binding->descriptorType, desc_data)) + needs_descriptor_buffer = true; + + if (flags & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) + varying_desc_type = binding->descriptorType; + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + /* There is no real limit on samplers */ + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + /* Inline uniforms don't use a binding */ + break; + + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + if (anv_descriptor_data_supports_bindless(pdevice, desc_data, false)) + break; + + if (binding->pImmutableSamplers) { + for (uint32_t i = 0; i < binding->descriptorCount; i++) { + ANV_FROM_HANDLE(anv_sampler, sampler, + binding->pImmutableSamplers[i]); + anv_foreach_stage(s, binding->stageFlags) + surface_count[s] += sampler->n_planes; + } + } else { + anv_foreach_stage(s, binding->stageFlags) + surface_count[s] += binding->descriptorCount; + } + break; + + default: + if (anv_descriptor_data_supports_bindless(pdevice, desc_data, false)) + break; + + anv_foreach_stage(s, binding->stageFlags) + surface_count[s] += binding->descriptorCount; + break; + } + } + + for (unsigned s = 0; s < ARRAY_SIZE(surface_count); s++) { + if (needs_descriptor_buffer) + surface_count[s] += 1; + } + + VkDescriptorSetVariableDescriptorCountLayoutSupport *vdcls = + vk_find_struct(pSupport->pNext, + DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT); + if (vdcls != NULL) { + if (varying_desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + vdcls->maxVariableDescriptorCount = MAX_INLINE_UNIFORM_BLOCK_SIZE; + } else if (varying_desc_type != VK_DESCRIPTOR_TYPE_MAX_ENUM) { + vdcls->maxVariableDescriptorCount = UINT16_MAX; + } else { + vdcls->maxVariableDescriptorCount = 0; + } + } + + bool supported = true; + for (unsigned s = 0; s < ARRAY_SIZE(surface_count); s++) { + /* Our maximum binding table size is 240 and we need to reserve 8 for + * render targets. + */ + if (surface_count[s] > MAX_BINDING_TABLE_SIZE - MAX_RTS) + supported = false; + } + + pSupport->supported = supported; +} + +VkResult anv_CreateDescriptorSetLayout( + VkDevice _device, + const VkDescriptorSetLayoutCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDescriptorSetLayout* pSetLayout) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO); + + uint32_t num_bindings = 0; + uint32_t immutable_sampler_count = 0; + for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { + num_bindings = MAX2(num_bindings, pCreateInfo->pBindings[j].binding + 1); + + /* From the Vulkan 1.1.97 spec for VkDescriptorSetLayoutBinding: + * + * "If descriptorType specifies a VK_DESCRIPTOR_TYPE_SAMPLER or + * VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER type descriptor, then + * pImmutableSamplers can be used to initialize a set of immutable + * samplers. [...] If descriptorType is not one of these descriptor + * types, then pImmutableSamplers is ignored. + * + * We need to be careful here and only parse pImmutableSamplers if we + * have one of the right descriptor types. 
+ */ + VkDescriptorType desc_type = pCreateInfo->pBindings[j].descriptorType; + if ((desc_type == VK_DESCRIPTOR_TYPE_SAMPLER || + desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) && + pCreateInfo->pBindings[j].pImmutableSamplers) + immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount; + } + + /* We need to allocate descriptor set layouts off the device allocator + * with DEVICE scope because they are reference counted and may not be + * destroyed when vkDestroyDescriptorSetLayout is called. + */ + VK_MULTIALLOC(ma); + VK_MULTIALLOC_DECL(&ma, struct anv_descriptor_set_layout, set_layout, 1); + VK_MULTIALLOC_DECL(&ma, struct anv_descriptor_set_binding_layout, + bindings, num_bindings); + VK_MULTIALLOC_DECL(&ma, struct anv_sampler *, samplers, + immutable_sampler_count); + + if (!vk_object_multizalloc(&device->vk, &ma, NULL, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + set_layout->ref_cnt = 1; + set_layout->binding_count = num_bindings; + + for (uint32_t b = 0; b < num_bindings; b++) { + /* Initialize all binding_layout entries to -1 */ + memset(&set_layout->binding[b], -1, sizeof(set_layout->binding[b])); + + set_layout->binding[b].flags = 0; + set_layout->binding[b].data = 0; + set_layout->binding[b].max_plane_count = 0; + set_layout->binding[b].array_size = 0; + set_layout->binding[b].immutable_samplers = NULL; + } + + /* Initialize all samplers to 0 */ + memset(samplers, 0, immutable_sampler_count * sizeof(*samplers)); + + uint32_t buffer_view_count = 0; + uint32_t dynamic_offset_count = 0; + uint32_t descriptor_buffer_size = 0; + + for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j]; + uint32_t b = binding->binding; + /* We temporarily store pCreateInfo->pBindings[] index (plus one) in the + * immutable_samplers pointer. This provides us with a quick-and-dirty + * way to sort the bindings by binding number. + */ + set_layout->binding[b].immutable_samplers = (void *)(uintptr_t)(j + 1); + } + + const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags_info = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); + + const VkMutableDescriptorTypeCreateInfoVALVE *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_VALVE); + + for (uint32_t b = 0; b < num_bindings; b++) { + /* We stashed the pCreateInfo->pBindings[] index (plus one) in the + * immutable_samplers pointer. Check for NULL (empty binding) and then + * reset it and compute the index. 
+ */ + if (set_layout->binding[b].immutable_samplers == NULL) + continue; + const uint32_t info_idx = + (uintptr_t)(void *)set_layout->binding[b].immutable_samplers - 1; + set_layout->binding[b].immutable_samplers = NULL; + + const VkDescriptorSetLayoutBinding *binding = + &pCreateInfo->pBindings[info_idx]; + + if (binding->descriptorCount == 0) + continue; + + set_layout->binding[b].type = binding->descriptorType; + + if (binding_flags_info && binding_flags_info->bindingCount > 0) { + assert(binding_flags_info->bindingCount == pCreateInfo->bindingCount); + set_layout->binding[b].flags = + binding_flags_info->pBindingFlags[info_idx]; + + /* From the Vulkan spec: + * + * "If VkDescriptorSetLayoutCreateInfo::flags includes + * VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, then + * all elements of pBindingFlags must not include + * VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT, + * VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT, or + * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT" + */ + if (pCreateInfo->flags & + VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) { + assert(!(set_layout->binding[b].flags & + (VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT | + VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT | + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT))); + } + } + + set_layout->binding[b].data = + binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? + anv_descriptor_data_for_mutable_type(device->physical, mutable_info, b) : + anv_descriptor_data_for_type(device->physical, binding->descriptorType); + + set_layout->binding[b].array_size = binding->descriptorCount; + set_layout->binding[b].descriptor_index = set_layout->descriptor_count; + set_layout->descriptor_count += binding->descriptorCount; + + if (set_layout->binding[b].data & ANV_DESCRIPTOR_BUFFER_VIEW) { + set_layout->binding[b].buffer_view_index = buffer_view_count; + buffer_view_count += binding->descriptorCount; + } + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE: + set_layout->binding[b].max_plane_count = 1; + if (binding->pImmutableSamplers) { + set_layout->binding[b].immutable_samplers = samplers; + samplers += binding->descriptorCount; + + for (uint32_t i = 0; i < binding->descriptorCount; i++) { + ANV_FROM_HANDLE(anv_sampler, sampler, + binding->pImmutableSamplers[i]); + + set_layout->binding[b].immutable_samplers[i] = sampler; + if (set_layout->binding[b].max_plane_count < sampler->n_planes) + set_layout->binding[b].max_plane_count = sampler->n_planes; + } + } + break; + + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + set_layout->binding[b].max_plane_count = 1; + break; + + default: + break; + } + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + set_layout->binding[b].dynamic_offset_index = dynamic_offset_count; + set_layout->dynamic_offset_stages[dynamic_offset_count] = binding->stageFlags; + dynamic_offset_count += binding->descriptorCount; + assert(dynamic_offset_count < MAX_DYNAMIC_BUFFERS); + break; + + default: + break; + } + + set_layout->binding[b].descriptor_stride = + binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? 
+ anv_descriptor_size_for_mutable_type(device->physical, mutable_info, b) : + anv_descriptor_size(&set_layout->binding[b]); + + if (binding->descriptorType == + VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + /* Inline uniform blocks are specified to use the descriptor array + * size as the size in bytes of the block. + */ + descriptor_buffer_size = align_u32(descriptor_buffer_size, + ANV_UBO_ALIGNMENT); + set_layout->binding[b].descriptor_offset = descriptor_buffer_size; + descriptor_buffer_size += binding->descriptorCount; + } else { + set_layout->binding[b].descriptor_offset = descriptor_buffer_size; + descriptor_buffer_size += + set_layout->binding[b].descriptor_stride * binding->descriptorCount; + } + + set_layout->shader_stages |= binding->stageFlags; + } + + set_layout->buffer_view_count = buffer_view_count; + set_layout->dynamic_offset_count = dynamic_offset_count; + set_layout->descriptor_buffer_size = descriptor_buffer_size; + + *pSetLayout = anv_descriptor_set_layout_to_handle(set_layout); + + return VK_SUCCESS; +} + +void +anv_descriptor_set_layout_destroy(struct anv_device *device, + struct anv_descriptor_set_layout *layout) +{ + assert(layout->ref_cnt == 0); + vk_object_free(&device->vk, NULL, layout); +} + +static const struct anv_descriptor_set_binding_layout * +set_layout_dynamic_binding(const struct anv_descriptor_set_layout *set_layout) +{ + if (set_layout->binding_count == 0) + return NULL; + + const struct anv_descriptor_set_binding_layout *last_binding = + &set_layout->binding[set_layout->binding_count - 1]; + if (!(last_binding->flags & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) + return NULL; + + return last_binding; +} + +static uint32_t +set_layout_descriptor_count(const struct anv_descriptor_set_layout *set_layout, + uint32_t var_desc_count) +{ + const struct anv_descriptor_set_binding_layout *dynamic_binding = + set_layout_dynamic_binding(set_layout); + if (dynamic_binding == NULL) + return set_layout->descriptor_count; + + assert(var_desc_count <= dynamic_binding->array_size); + uint32_t shrink = dynamic_binding->array_size - var_desc_count; + + if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) + return set_layout->descriptor_count; + + return set_layout->descriptor_count - shrink; +} + +static uint32_t +set_layout_buffer_view_count(const struct anv_descriptor_set_layout *set_layout, + uint32_t var_desc_count) +{ + const struct anv_descriptor_set_binding_layout *dynamic_binding = + set_layout_dynamic_binding(set_layout); + if (dynamic_binding == NULL) + return set_layout->buffer_view_count; + + assert(var_desc_count <= dynamic_binding->array_size); + uint32_t shrink = dynamic_binding->array_size - var_desc_count; + + if (!(dynamic_binding->data & ANV_DESCRIPTOR_BUFFER_VIEW)) + return set_layout->buffer_view_count; + + return set_layout->buffer_view_count - shrink; +} + +uint32_t +anv_descriptor_set_layout_descriptor_buffer_size(const struct anv_descriptor_set_layout *set_layout, + uint32_t var_desc_count) +{ + const struct anv_descriptor_set_binding_layout *dynamic_binding = + set_layout_dynamic_binding(set_layout); + if (dynamic_binding == NULL) + return ALIGN(set_layout->descriptor_buffer_size, ANV_UBO_ALIGNMENT); + + assert(var_desc_count <= dynamic_binding->array_size); + uint32_t shrink = dynamic_binding->array_size - var_desc_count; + uint32_t set_size; + + if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + /* Inline uniform blocks are specified to use the descriptor array + * size as the size in bytes of 
the block. + */ + set_size = set_layout->descriptor_buffer_size - shrink; + } else { + set_size = set_layout->descriptor_buffer_size - + shrink * dynamic_binding->descriptor_stride; + } + + return ALIGN(set_size, ANV_UBO_ALIGNMENT); +} + +void anv_DestroyDescriptorSetLayout( + VkDevice _device, + VkDescriptorSetLayout _set_layout, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout, _set_layout); + + if (!set_layout) + return; + + anv_descriptor_set_layout_unref(device, set_layout); +} + +#define SHA1_UPDATE_VALUE(ctx, x) _mesa_sha1_update(ctx, &(x), sizeof(x)); + +static void +sha1_update_immutable_sampler(struct mesa_sha1 *ctx, + const struct anv_sampler *sampler) +{ + if (!sampler->conversion) + return; + + /* The only thing that affects the shader is ycbcr conversion */ + _mesa_sha1_update(ctx, sampler->conversion, + sizeof(*sampler->conversion)); +} + +static void +sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx, + const struct anv_descriptor_set_binding_layout *layout) +{ + SHA1_UPDATE_VALUE(ctx, layout->flags); + SHA1_UPDATE_VALUE(ctx, layout->data); + SHA1_UPDATE_VALUE(ctx, layout->max_plane_count); + SHA1_UPDATE_VALUE(ctx, layout->array_size); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_index); + SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_index); + SHA1_UPDATE_VALUE(ctx, layout->buffer_view_index); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_offset); + + if (layout->immutable_samplers) { + for (uint16_t i = 0; i < layout->array_size; i++) + sha1_update_immutable_sampler(ctx, layout->immutable_samplers[i]); + } +} + +static void +sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx, + const struct anv_descriptor_set_layout *layout) +{ + SHA1_UPDATE_VALUE(ctx, layout->binding_count); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_count); + SHA1_UPDATE_VALUE(ctx, layout->shader_stages); + SHA1_UPDATE_VALUE(ctx, layout->buffer_view_count); + SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_size); + + for (uint16_t i = 0; i < layout->binding_count; i++) + sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i]); +} + +/* + * Pipeline layouts. These have nothing to do with the pipeline. 
They are + * just multiple descriptor set layouts pasted together + */ + +VkResult anv_CreatePipelineLayout( + VkDevice _device, + const VkPipelineLayoutCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkPipelineLayout* pPipelineLayout) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_pipeline_layout *layout; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO); + + layout = vk_object_alloc(&device->vk, pAllocator, sizeof(*layout), + VK_OBJECT_TYPE_PIPELINE_LAYOUT); + if (layout == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + layout->num_sets = pCreateInfo->setLayoutCount; + + unsigned dynamic_offset_count = 0; + + for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) { + ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout, + pCreateInfo->pSetLayouts[set]); + layout->set[set].layout = set_layout; + anv_descriptor_set_layout_ref(set_layout); + + layout->set[set].dynamic_offset_start = dynamic_offset_count; + dynamic_offset_count += set_layout->dynamic_offset_count; + } + assert(dynamic_offset_count < MAX_DYNAMIC_BUFFERS); + + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + for (unsigned s = 0; s < layout->num_sets; s++) { + sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout); + _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start, + sizeof(layout->set[s].dynamic_offset_start)); + } + _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets)); + _mesa_sha1_final(&ctx, layout->sha1); + + *pPipelineLayout = anv_pipeline_layout_to_handle(layout); + + return VK_SUCCESS; +} + +void anv_DestroyPipelineLayout( + VkDevice _device, + VkPipelineLayout _pipelineLayout, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, _pipelineLayout); + + if (!pipeline_layout) + return; + + for (uint32_t i = 0; i < pipeline_layout->num_sets; i++) + anv_descriptor_set_layout_unref(device, pipeline_layout->set[i].layout); + + vk_object_free(&device->vk, pAllocator, pipeline_layout); +} + +/* + * Descriptor pools. + * + * These are implemented using a big pool of memory and a free-list for the + * host memory allocations and a state_stream and a free list for the buffer + * view surface state. The spec allows us to fail to allocate due to + * fragmentation in all cases but two: 1) after pool reset, allocating up + * until the pool size with no freeing must succeed and 2) allocating and + * freeing only descriptor sets with the same layout. Case 1) is easy enough, + * and the free lists lets us recycle blocks for case 2). + */ + +/* The vma heap reserves 0 to mean NULL; we have to offset by some amount to + * ensure we can allocate the entire BO without hitting zero. The actual + * amount doesn't matter. 
+ */ +#define POOL_HEAP_OFFSET 64 + +#define EMPTY 1 + +VkResult anv_CreateDescriptorPool( + VkDevice _device, + const VkDescriptorPoolCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDescriptorPool* pDescriptorPool) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_descriptor_pool *pool; + + const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO); + const VkMutableDescriptorTypeCreateInfoVALVE *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_VALVE); + + uint32_t descriptor_count = 0; + uint32_t buffer_view_count = 0; + uint32_t descriptor_bo_size = 0; + + for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; i++) { + enum anv_descriptor_data desc_data = + pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? + anv_descriptor_data_for_mutable_type(device->physical, mutable_info, i) : + anv_descriptor_data_for_type(device->physical, pCreateInfo->pPoolSizes[i].type); + + if (desc_data & ANV_DESCRIPTOR_BUFFER_VIEW) + buffer_view_count += pCreateInfo->pPoolSizes[i].descriptorCount; + + unsigned desc_data_size = + pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? + anv_descriptor_size_for_mutable_type(device->physical, mutable_info, i) : + anv_descriptor_data_size(desc_data); + + desc_data_size *= pCreateInfo->pPoolSizes[i].descriptorCount; + + /* Combined image sampler descriptors can take up to 3 slots if they + * hold a YCbCr image. + */ + if (pCreateInfo->pPoolSizes[i].type == + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + desc_data_size *= 3; + + if (pCreateInfo->pPoolSizes[i].type == + VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + /* Inline uniform blocks are specified to use the descriptor array + * size as the size in bytes of the block. + */ + assert(inline_info); + desc_data_size += pCreateInfo->pPoolSizes[i].descriptorCount; + } + + descriptor_bo_size += desc_data_size; + + descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount; + } + /* We have to align descriptor buffer allocations to 32B so that we can + * push descriptor buffers. This means that each descriptor buffer + * allocated may burn up to 32B of extra space to get the right alignment. + * (Technically, it's at most 28B because we're always going to start at + * least 4B aligned but we're being conservative here.) Allocate enough + * extra space that we can chop it into maxSets pieces and align each one + * of them to 32B. 
+ */ + descriptor_bo_size += ANV_UBO_ALIGNMENT * pCreateInfo->maxSets; + /* We align inline uniform blocks to ANV_UBO_ALIGNMENT */ + if (inline_info) { + descriptor_bo_size += + ANV_UBO_ALIGNMENT * inline_info->maxInlineUniformBlockBindings; + } + descriptor_bo_size = ALIGN(descriptor_bo_size, 4096); + + const size_t pool_size = + pCreateInfo->maxSets * sizeof(struct anv_descriptor_set) + + descriptor_count * sizeof(struct anv_descriptor) + + buffer_view_count * sizeof(struct anv_buffer_view); + const size_t total_size = sizeof(*pool) + pool_size; + + pool = vk_object_alloc(&device->vk, pAllocator, total_size, + VK_OBJECT_TYPE_DESCRIPTOR_POOL); + if (!pool) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + pool->size = pool_size; + pool->next = 0; + pool->free_list = EMPTY; + pool->host_only = pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_VALVE; + + if (descriptor_bo_size > 0) { + VkResult result = anv_device_alloc_bo(device, + "descriptors", + descriptor_bo_size, + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED, + 0 /* explicit_address */, + &pool->bo); + if (result != VK_SUCCESS) { + vk_object_free(&device->vk, pAllocator, pool); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, descriptor_bo_size); + } else { + pool->bo = NULL; + } + + anv_state_stream_init(&pool->surface_state_stream, + &device->surface_state_pool, 4096); + pool->surface_state_free_list = NULL; + + list_inithead(&pool->desc_sets); + + *pDescriptorPool = anv_descriptor_pool_to_handle(pool); + + return VK_SUCCESS; +} + +void anv_DestroyDescriptorPool( + VkDevice _device, + VkDescriptorPool _pool, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_pool, pool, _pool); + + if (!pool) + return; + + list_for_each_entry_safe(struct anv_descriptor_set, set, + &pool->desc_sets, pool_link) { + anv_descriptor_set_layout_unref(device, set->layout); + } + + if (pool->bo) { + util_vma_heap_finish(&pool->bo_heap); + anv_device_release_bo(device, pool->bo); + } + anv_state_stream_finish(&pool->surface_state_stream); + + vk_object_free(&device->vk, pAllocator, pool); +} + +VkResult anv_ResetDescriptorPool( + VkDevice _device, + VkDescriptorPool descriptorPool, + VkDescriptorPoolResetFlags flags) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_pool, pool, descriptorPool); + + list_for_each_entry_safe(struct anv_descriptor_set, set, + &pool->desc_sets, pool_link) { + anv_descriptor_set_layout_unref(device, set->layout); + } + list_inithead(&pool->desc_sets); + + pool->next = 0; + pool->free_list = EMPTY; + + if (pool->bo) { + util_vma_heap_finish(&pool->bo_heap); + util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, pool->bo->size); + } + + anv_state_stream_finish(&pool->surface_state_stream); + anv_state_stream_init(&pool->surface_state_stream, + &device->surface_state_pool, 4096); + pool->surface_state_free_list = NULL; + + return VK_SUCCESS; +} + +struct pool_free_list_entry { + uint32_t next; + uint32_t size; +}; + +static VkResult +anv_descriptor_pool_alloc_set(struct anv_descriptor_pool *pool, + uint32_t size, + struct anv_descriptor_set **set) +{ + if (size <= pool->size - pool->next) { + *set = (struct anv_descriptor_set *) (pool->data + pool->next); + (*set)->size = size; + pool->next += size; + return VK_SUCCESS; + } else { + struct pool_free_list_entry *entry; + uint32_t *link = &pool->free_list; + for (uint32_t f = 
pool->free_list; f != EMPTY; f = entry->next) { + entry = (struct pool_free_list_entry *) (pool->data + f); + if (size <= entry->size) { + *link = entry->next; + *set = (struct anv_descriptor_set *) entry; + (*set)->size = entry->size; + return VK_SUCCESS; + } + link = &entry->next; + } + + if (pool->free_list != EMPTY) { + return VK_ERROR_FRAGMENTED_POOL; + } else { + return VK_ERROR_OUT_OF_POOL_MEMORY; + } + } +} + +static void +anv_descriptor_pool_free_set(struct anv_descriptor_pool *pool, + struct anv_descriptor_set *set) +{ + /* Put the descriptor set allocation back on the free list. */ + const uint32_t index = (char *) set - pool->data; + if (index + set->size == pool->next) { + pool->next = index; + } else { + struct pool_free_list_entry *entry = (struct pool_free_list_entry *) set; + entry->next = pool->free_list; + entry->size = set->size; + pool->free_list = (char *) entry - pool->data; + } +} + +struct surface_state_free_list_entry { + void *next; + struct anv_state state; +}; + +static struct anv_state +anv_descriptor_pool_alloc_state(struct anv_descriptor_pool *pool) +{ + assert(!pool->host_only); + + struct surface_state_free_list_entry *entry = + pool->surface_state_free_list; + + if (entry) { + struct anv_state state = entry->state; + pool->surface_state_free_list = entry->next; + assert(state.alloc_size == 64); + return state; + } else { + return anv_state_stream_alloc(&pool->surface_state_stream, 64, 64); + } +} + +static void +anv_descriptor_pool_free_state(struct anv_descriptor_pool *pool, + struct anv_state state) +{ + assert(state.alloc_size); + /* Put the buffer view surface state back on the free list. */ + struct surface_state_free_list_entry *entry = state.map; + entry->next = pool->surface_state_free_list; + entry->state = state; + pool->surface_state_free_list = entry; +} + +size_t +anv_descriptor_set_layout_size(const struct anv_descriptor_set_layout *layout, + uint32_t var_desc_count) +{ + const uint32_t descriptor_count = + set_layout_descriptor_count(layout, var_desc_count); + const uint32_t buffer_view_count = + set_layout_buffer_view_count(layout, var_desc_count); + + return sizeof(struct anv_descriptor_set) + + descriptor_count * sizeof(struct anv_descriptor) + + buffer_view_count * sizeof(struct anv_buffer_view); +} + +static VkResult +anv_descriptor_set_create(struct anv_device *device, + struct anv_descriptor_pool *pool, + struct anv_descriptor_set_layout *layout, + uint32_t var_desc_count, + struct anv_descriptor_set **out_set) +{ + struct anv_descriptor_set *set; + const size_t size = anv_descriptor_set_layout_size(layout, var_desc_count); + + VkResult result = anv_descriptor_pool_alloc_set(pool, size, &set); + if (result != VK_SUCCESS) + return result; + + uint32_t descriptor_buffer_size = + anv_descriptor_set_layout_descriptor_buffer_size(layout, var_desc_count); + + set->desc_surface_state = ANV_STATE_NULL; + + if (descriptor_buffer_size) { + uint64_t pool_vma_offset = + util_vma_heap_alloc(&pool->bo_heap, descriptor_buffer_size, + ANV_UBO_ALIGNMENT); + if (pool_vma_offset == 0) { + anv_descriptor_pool_free_set(pool, set); + return vk_error(pool, VK_ERROR_FRAGMENTED_POOL); + } + assert(pool_vma_offset >= POOL_HEAP_OFFSET && + pool_vma_offset - POOL_HEAP_OFFSET <= INT32_MAX); + set->desc_mem.offset = pool_vma_offset - POOL_HEAP_OFFSET; + set->desc_mem.alloc_size = descriptor_buffer_size; + set->desc_mem.map = pool->bo->map + set->desc_mem.offset; + + set->desc_addr = (struct anv_address) { + .bo = pool->bo, + .offset = set->desc_mem.offset, + }; + 
+ enum isl_format format = + anv_isl_format_for_descriptor_type(device, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); + + if (!pool->host_only) { + set->desc_surface_state = anv_descriptor_pool_alloc_state(pool); + anv_fill_buffer_surface_state(device, set->desc_surface_state, + format, ISL_SWIZZLE_IDENTITY, + ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, + set->desc_addr, + descriptor_buffer_size, 1); + } + } else { + set->desc_mem = ANV_STATE_NULL; + set->desc_addr = (struct anv_address) { .bo = NULL, .offset = 0 }; + } + + vk_object_base_init(&device->vk, &set->base, + VK_OBJECT_TYPE_DESCRIPTOR_SET); + set->pool = pool; + set->layout = layout; + anv_descriptor_set_layout_ref(layout); + + set->buffer_view_count = + set_layout_buffer_view_count(layout, var_desc_count); + set->descriptor_count = + set_layout_descriptor_count(layout, var_desc_count); + + set->buffer_views = + (struct anv_buffer_view *) &set->descriptors[set->descriptor_count]; + + /* By defining the descriptors to be zero now, we can later verify that + * a descriptor has not been populated with user data. + */ + memset(set->descriptors, 0, + sizeof(struct anv_descriptor) * set->descriptor_count); + + /* Go through and fill out immutable samplers if we have any */ + for (uint32_t b = 0; b < layout->binding_count; b++) { + if (layout->binding[b].immutable_samplers) { + for (uint32_t i = 0; i < layout->binding[b].array_size; i++) { + /* The type will get changed to COMBINED_IMAGE_SAMPLER in + * UpdateDescriptorSets if needed. However, if the descriptor + * set has an immutable sampler, UpdateDescriptorSets may never + * touch it, so we need to make sure it's 100% valid now. + * + * We don't need to actually provide a sampler because the helper + * will always write in the immutable sampler regardless of what + * is in the sampler parameter. + */ + VkDescriptorImageInfo info = { }; + anv_descriptor_set_write_image_view(device, set, &info, + VK_DESCRIPTOR_TYPE_SAMPLER, + b, i); + } + } + } + + /* Allocate null surface state for the buffer views since + * we lazy allocate this in the write anyway. 
+ */ + if (!pool->host_only) { + for (uint32_t b = 0; b < set->buffer_view_count; b++) { + set->buffer_views[b].surface_state = + anv_descriptor_pool_alloc_state(pool); + } + } + + list_addtail(&set->pool_link, &pool->desc_sets); + + *out_set = set; + + return VK_SUCCESS; +} + +static void +anv_descriptor_set_destroy(struct anv_device *device, + struct anv_descriptor_pool *pool, + struct anv_descriptor_set *set) +{ + anv_descriptor_set_layout_unref(device, set->layout); + + if (set->desc_mem.alloc_size) { + util_vma_heap_free(&pool->bo_heap, + (uint64_t)set->desc_mem.offset + POOL_HEAP_OFFSET, + set->desc_mem.alloc_size); + if (set->desc_surface_state.alloc_size) + anv_descriptor_pool_free_state(pool, set->desc_surface_state); + } + + if (!pool->host_only) { + for (uint32_t b = 0; b < set->buffer_view_count; b++) { + if (set->buffer_views[b].surface_state.alloc_size) + anv_descriptor_pool_free_state(pool, set->buffer_views[b].surface_state); + } + } + + list_del(&set->pool_link); + + vk_object_base_finish(&set->base); + anv_descriptor_pool_free_set(pool, set); +} + +VkResult anv_AllocateDescriptorSets( + VkDevice _device, + const VkDescriptorSetAllocateInfo* pAllocateInfo, + VkDescriptorSet* pDescriptorSets) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_pool, pool, pAllocateInfo->descriptorPool); + + VkResult result = VK_SUCCESS; + struct anv_descriptor_set *set = NULL; + uint32_t i; + + const VkDescriptorSetVariableDescriptorCountAllocateInfo *vdcai = + vk_find_struct_const(pAllocateInfo->pNext, + DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO); + + for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { + ANV_FROM_HANDLE(anv_descriptor_set_layout, layout, + pAllocateInfo->pSetLayouts[i]); + + uint32_t var_desc_count = 0; + if (vdcai != NULL && vdcai->descriptorSetCount > 0) { + assert(vdcai->descriptorSetCount == pAllocateInfo->descriptorSetCount); + var_desc_count = vdcai->pDescriptorCounts[i]; + } + + result = anv_descriptor_set_create(device, pool, layout, + var_desc_count, &set); + if (result != VK_SUCCESS) + break; + + pDescriptorSets[i] = anv_descriptor_set_to_handle(set); + } + + if (result != VK_SUCCESS) + anv_FreeDescriptorSets(_device, pAllocateInfo->descriptorPool, + i, pDescriptorSets); + + return result; +} + +VkResult anv_FreeDescriptorSets( + VkDevice _device, + VkDescriptorPool descriptorPool, + uint32_t count, + const VkDescriptorSet* pDescriptorSets) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_pool, pool, descriptorPool); + + for (uint32_t i = 0; i < count; i++) { + ANV_FROM_HANDLE(anv_descriptor_set, set, pDescriptorSets[i]); + + if (!set) + continue; + + anv_descriptor_set_destroy(device, pool, set); + } + + return VK_SUCCESS; +} + +static void +anv_descriptor_set_write_image_param(uint32_t *param_desc_map, + const struct brw_image_param *param) +{ +#define WRITE_PARAM_FIELD(field, FIELD) \ + for (unsigned i = 0; i < ARRAY_SIZE(param->field); i++) \ + param_desc_map[BRW_IMAGE_PARAM_##FIELD##_OFFSET + i] = param->field[i] + + WRITE_PARAM_FIELD(offset, OFFSET); + WRITE_PARAM_FIELD(size, SIZE); + WRITE_PARAM_FIELD(stride, STRIDE); + WRITE_PARAM_FIELD(tiling, TILING); + WRITE_PARAM_FIELD(swizzling, SWIZZLING); + WRITE_PARAM_FIELD(size, SIZE); + +#undef WRITE_PARAM_FIELD +} + +static uint32_t +anv_surface_state_to_handle(struct anv_state state) +{ + /* Bits 31:12 of the bindless surface offset in the extended message + * descriptor is bits 25:6 of the byte-based address. 
+ */ + assert(state.offset >= 0); + uint32_t offset = state.offset; + assert((offset & 0x3f) == 0 && offset < (1 << 26)); + return offset << 6; +} + +void +anv_descriptor_set_write_image_view(struct anv_device *device, + struct anv_descriptor_set *set, + const VkDescriptorImageInfo * const info, + VkDescriptorType type, + uint32_t binding, + uint32_t element) +{ + const struct anv_descriptor_set_binding_layout *bind_layout = + &set->layout->binding[binding]; + struct anv_descriptor *desc = + &set->descriptors[bind_layout->descriptor_index + element]; + struct anv_image_view *image_view = NULL; + struct anv_sampler *sampler = NULL; + + /* We get called with just VK_DESCRIPTOR_TYPE_SAMPLER as part of descriptor + * set initialization to set the bindless samplers. + */ + assert(type == bind_layout->type || + type == VK_DESCRIPTOR_TYPE_SAMPLER || + bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE); + + switch (type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + sampler = bind_layout->immutable_samplers ? + bind_layout->immutable_samplers[element] : + anv_sampler_from_handle(info->sampler); + break; + + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + image_view = anv_image_view_from_handle(info->imageView); + sampler = bind_layout->immutable_samplers ? + bind_layout->immutable_samplers[element] : + anv_sampler_from_handle(info->sampler); + break; + + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + image_view = anv_image_view_from_handle(info->imageView); + break; + + default: + unreachable("invalid descriptor type"); + } + + *desc = (struct anv_descriptor) { + .type = type, + .layout = info->imageLayout, + .image_view = image_view, + .sampler = sampler, + }; + + if (set->pool && set->pool->host_only) + return; + + void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + + element * bind_layout->descriptor_stride; + memset(desc_map, 0, bind_layout->descriptor_stride); + enum anv_descriptor_data data = + bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? + anv_descriptor_data_for_type(device->physical, type) : + bind_layout->data; + + + if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE) { + struct anv_sampled_image_descriptor desc_data[3]; + memset(desc_data, 0, sizeof(desc_data)); + + if (image_view) { + for (unsigned p = 0; p < image_view->n_planes; p++) { + struct anv_surface_state sstate = + (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? + image_view->planes[p].general_sampler_surface_state : + image_view->planes[p].optimal_sampler_surface_state; + desc_data[p].image = anv_surface_state_to_handle(sstate.state); + } + } + + if (sampler) { + for (unsigned p = 0; p < sampler->n_planes; p++) + desc_data[p].sampler = sampler->bindless_state.offset + p * 32; + } + + /* We may have max_plane_count < 0 if this isn't a sampled image but it + * can be no more than the size of our array of handles. 
+ */ + assert(bind_layout->max_plane_count <= ARRAY_SIZE(desc_data)); + memcpy(desc_map, desc_data, + MAX2(1, bind_layout->max_plane_count) * sizeof(desc_data[0])); + } + + if (image_view == NULL) + return; + + if (data & ANV_DESCRIPTOR_STORAGE_IMAGE) { + assert(!(data & ANV_DESCRIPTOR_IMAGE_PARAM)); + assert(image_view->n_planes == 1); + struct anv_storage_image_descriptor desc_data = { + .vanilla = anv_surface_state_to_handle( + image_view->planes[0].storage_surface_state.state), + .lowered = anv_surface_state_to_handle( + image_view->planes[0].lowered_storage_surface_state.state), + }; + memcpy(desc_map, &desc_data, sizeof(desc_data)); + } + + if (data & ANV_DESCRIPTOR_IMAGE_PARAM) { + /* Storage images can only ever have one plane */ + assert(image_view->n_planes == 1); + const struct brw_image_param *image_param = + &image_view->planes[0].lowered_storage_image_param; + + anv_descriptor_set_write_image_param(desc_map, image_param); + } + + if (data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE) { + assert(!(data & ANV_DESCRIPTOR_SAMPLED_IMAGE)); + assert(image_view); + struct anv_texture_swizzle_descriptor desc_data[3]; + memset(desc_data, 0, sizeof(desc_data)); + + for (unsigned p = 0; p < image_view->n_planes; p++) { + desc_data[p] = (struct anv_texture_swizzle_descriptor) { + .swizzle = { + (uint8_t)image_view->planes[p].isl.swizzle.r, + (uint8_t)image_view->planes[p].isl.swizzle.g, + (uint8_t)image_view->planes[p].isl.swizzle.b, + (uint8_t)image_view->planes[p].isl.swizzle.a, + }, + }; + } + memcpy(desc_map, desc_data, + MAX2(1, bind_layout->max_plane_count) * sizeof(desc_data[0])); + } +} + +void +anv_descriptor_set_write_buffer_view(struct anv_device *device, + struct anv_descriptor_set *set, + VkDescriptorType type, + struct anv_buffer_view *buffer_view, + uint32_t binding, + uint32_t element) +{ + const struct anv_descriptor_set_binding_layout *bind_layout = + &set->layout->binding[binding]; + struct anv_descriptor *desc = + &set->descriptors[bind_layout->descriptor_index + element]; + + assert(type == bind_layout->type || + bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE); + + *desc = (struct anv_descriptor) { + .type = type, + .buffer_view = buffer_view, + }; + + if (set->pool && set->pool->host_only) + return; + + enum anv_descriptor_data data = + bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? 
+ anv_descriptor_data_for_type(device->physical, type) : + bind_layout->data; + + void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + + element * bind_layout->descriptor_stride; + + if (buffer_view == NULL) { + memset(desc_map, 0, bind_layout->descriptor_stride); + return; + } + + if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE) { + struct anv_sampled_image_descriptor desc_data = { + .image = anv_surface_state_to_handle(buffer_view->surface_state), + }; + memcpy(desc_map, &desc_data, sizeof(desc_data)); + } + + if (data & ANV_DESCRIPTOR_STORAGE_IMAGE) { + assert(!(data & ANV_DESCRIPTOR_IMAGE_PARAM)); + struct anv_storage_image_descriptor desc_data = { + .vanilla = anv_surface_state_to_handle( + buffer_view->storage_surface_state), + .lowered = anv_surface_state_to_handle( + buffer_view->lowered_storage_surface_state), + }; + memcpy(desc_map, &desc_data, sizeof(desc_data)); + } + + if (data & ANV_DESCRIPTOR_IMAGE_PARAM) { + anv_descriptor_set_write_image_param(desc_map, + &buffer_view->lowered_storage_image_param); + } +} + +void +anv_descriptor_set_write_buffer(struct anv_device *device, + struct anv_descriptor_set *set, + struct anv_state_stream *alloc_stream, + VkDescriptorType type, + struct anv_buffer *buffer, + uint32_t binding, + uint32_t element, + VkDeviceSize offset, + VkDeviceSize range) +{ + assert(alloc_stream || set->pool); + + const struct anv_descriptor_set_binding_layout *bind_layout = + &set->layout->binding[binding]; + struct anv_descriptor *desc = + &set->descriptors[bind_layout->descriptor_index + element]; + + assert(type == bind_layout->type || + bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE); + + *desc = (struct anv_descriptor) { + .type = type, + .offset = offset, + .range = range, + .buffer = buffer, + }; + + if (set->pool && set->pool->host_only) + return; + + void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + + element * bind_layout->descriptor_stride; + + if (buffer == NULL) { + memset(desc_map, 0, bind_layout->descriptor_stride); + return; + } + + struct anv_address bind_addr = anv_address_add(buffer->address, offset); + uint64_t bind_range = vk_buffer_range(&buffer->vk, offset, range); + enum anv_descriptor_data data = + bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? + anv_descriptor_data_for_type(device->physical, type) : + bind_layout->data; + + /* We report a bounds checking alignment of 32B for the sake of block + * messages which read an entire register worth at a time. + */ + if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || + type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) + bind_range = align_u64(bind_range, ANV_UBO_ALIGNMENT); + + if (data & ANV_DESCRIPTOR_ADDRESS_RANGE) { + struct anv_address_range_descriptor desc_data = { + .address = anv_address_physical(bind_addr), + .range = bind_range, + }; + memcpy(desc_map, &desc_data, sizeof(desc_data)); + } + + if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || + type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) + return; + + assert(data & ANV_DESCRIPTOR_BUFFER_VIEW); + struct anv_buffer_view *bview = + &set->buffer_views[bind_layout->buffer_view_index + element]; + + bview->range = bind_range; + bview->address = bind_addr; + + /* If we're writing descriptors through a push command, we need to + * allocate the surface state from the command buffer. Otherwise it will + * be allocated by the descriptor pool when calling + * vkAllocateDescriptorSets. 
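+    * (When the set comes from the pool instead, these surface states were
+    * already allocated in anv_descriptor_set_create above, in the
+    * !pool->host_only buffer_views loop.)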
*/ + if (alloc_stream) { + bview->surface_state = anv_state_stream_alloc(alloc_stream, 64, 64); + } + + assert(bview->surface_state.alloc_size); + + isl_surf_usage_flags_t usage = + (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || + type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) ? + ISL_SURF_USAGE_CONSTANT_BUFFER_BIT : + ISL_SURF_USAGE_STORAGE_BIT; + + enum isl_format format = anv_isl_format_for_descriptor_type(device, type); + anv_fill_buffer_surface_state(device, bview->surface_state, + format, ISL_SWIZZLE_IDENTITY, + usage, bind_addr, bind_range, 1); + desc->set_buffer_view = bview; +} + +void +anv_descriptor_set_write_inline_uniform_data(struct anv_device *device, + struct anv_descriptor_set *set, + uint32_t binding, + const void *data, + size_t offset, + size_t size) +{ + const struct anv_descriptor_set_binding_layout *bind_layout = + &set->layout->binding[binding]; + + assert(bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM); + + void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset; + + memcpy(desc_map + offset, data, size); +} + +void +anv_descriptor_set_write_acceleration_structure(struct anv_device *device, + struct anv_descriptor_set *set, + struct anv_acceleration_structure *accel, + uint32_t binding, + uint32_t element) +{ + const struct anv_descriptor_set_binding_layout *bind_layout = + &set->layout->binding[binding]; + struct anv_descriptor *desc = + &set->descriptors[bind_layout->descriptor_index + element]; + + assert(bind_layout->data & ANV_DESCRIPTOR_ADDRESS_RANGE); + *desc = (struct anv_descriptor) { + .type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR, + .accel_struct = accel, + }; + + if (set->pool && set->pool->host_only) + return; + + struct anv_address_range_descriptor desc_data = { }; + if (accel != NULL) { + desc_data.address = anv_address_physical(accel->address); + desc_data.range = accel->size; + } + assert(sizeof(desc_data) <= bind_layout->descriptor_stride); + + void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + + element * bind_layout->descriptor_stride; + memcpy(desc_map, &desc_data, sizeof(desc_data)); +} + +void anv_UpdateDescriptorSets( + VkDevice _device, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites, + uint32_t descriptorCopyCount, + const VkCopyDescriptorSet* pDescriptorCopies) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + for (uint32_t i = 0; i < descriptorWriteCount; i++) { + const VkWriteDescriptorSet *write = &pDescriptorWrites[i]; + ANV_FROM_HANDLE(anv_descriptor_set, set, write->dstSet); + + switch (write->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + anv_descriptor_set_write_image_view(device, set, + write->pImageInfo + j, + write->descriptorType, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + ANV_FROM_HANDLE(anv_buffer_view, bview, + write->pTexelBufferView[j]); + + anv_descriptor_set_write_buffer_view(device, set, + write->descriptorType, + bview, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + 
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + ANV_FROM_HANDLE(anv_buffer, buffer, write->pBufferInfo[j].buffer); + + anv_descriptor_set_write_buffer(device, set, + NULL, + write->descriptorType, + buffer, + write->dstBinding, + write->dstArrayElement + j, + write->pBufferInfo[j].offset, + write->pBufferInfo[j].range); + } + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { + const VkWriteDescriptorSetInlineUniformBlock *inline_write = + vk_find_struct_const(write->pNext, + WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK); + assert(inline_write->dataSize == write->descriptorCount); + anv_descriptor_set_write_inline_uniform_data(device, set, + write->dstBinding, + inline_write->pData, + write->dstArrayElement, + inline_write->dataSize); + break; + } + + case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: { + const VkWriteDescriptorSetAccelerationStructureKHR *accel_write = + vk_find_struct_const(write, WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR); + assert(accel_write->accelerationStructureCount == + write->descriptorCount); + for (uint32_t j = 0; j < write->descriptorCount; j++) { + ANV_FROM_HANDLE(anv_acceleration_structure, accel, + accel_write->pAccelerationStructures[j]); + anv_descriptor_set_write_acceleration_structure(device, set, accel, + write->dstBinding, + write->dstArrayElement + j); + } + break; + } + + default: + break; + } + } + + for (uint32_t i = 0; i < descriptorCopyCount; i++) { + const VkCopyDescriptorSet *copy = &pDescriptorCopies[i]; + ANV_FROM_HANDLE(anv_descriptor_set, src, copy->srcSet); + ANV_FROM_HANDLE(anv_descriptor_set, dst, copy->dstSet); + + const struct anv_descriptor_set_binding_layout *src_layout = + &src->layout->binding[copy->srcBinding]; + struct anv_descriptor *src_desc = + &src->descriptors[src_layout->descriptor_index]; + src_desc += copy->srcArrayElement; + + if (src_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + anv_descriptor_set_write_inline_uniform_data(device, dst, + copy->dstBinding, + src->desc_mem.map + src_layout->descriptor_offset + copy->srcArrayElement, + copy->dstArrayElement, + copy->descriptorCount); + continue; + } + + + /* Copy CPU side data */ + for (uint32_t j = 0; j < copy->descriptorCount; j++) { + switch(src_desc[j].type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: { + VkDescriptorImageInfo info = { + .sampler = anv_sampler_to_handle(src_desc[j].sampler), + .imageView = anv_image_view_to_handle(src_desc[j].image_view), + .imageLayout = src_desc[j].layout + }; + anv_descriptor_set_write_image_view(device, dst, + &info, + src_desc[j].type, + copy->dstBinding, + copy->dstArrayElement + j); + break; + } + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: { + anv_descriptor_set_write_buffer_view(device, dst, + src_desc[j].type, + src_desc[j].buffer_view, + copy->dstBinding, + copy->dstArrayElement + j); + break; + } + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + anv_descriptor_set_write_buffer(device, dst, + NULL, + src_desc[j].type, + src_desc[j].buffer, + copy->dstBinding, + copy->dstArrayElement + j, + src_desc[j].offset, + src_desc[j].range); + break; + } + + case 
VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: { + anv_descriptor_set_write_acceleration_structure(device, dst, + src_desc[j].accel_struct, + copy->dstBinding, + copy->dstArrayElement + j); + break; + } + + default: + break; + } + } + } +} + +/* + * Descriptor update templates. + */ + +void +anv_descriptor_set_write_template(struct anv_device *device, + struct anv_descriptor_set *set, + struct anv_state_stream *alloc_stream, + const struct anv_descriptor_update_template *template, + const void *data) +{ + for (uint32_t i = 0; i < template->entry_count; i++) { + const struct anv_descriptor_template_entry *entry = + &template->entries[i]; + + switch (entry->type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorImageInfo *info = + data + entry->offset + j * entry->stride; + anv_descriptor_set_write_image_view(device, set, + info, entry->type, + entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkBufferView *_bview = + data + entry->offset + j * entry->stride; + ANV_FROM_HANDLE(anv_buffer_view, bview, *_bview); + + anv_descriptor_set_write_buffer_view(device, set, + entry->type, + bview, + entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorBufferInfo *info = + data + entry->offset + j * entry->stride; + ANV_FROM_HANDLE(anv_buffer, buffer, info->buffer); + + anv_descriptor_set_write_buffer(device, set, + alloc_stream, + entry->type, + buffer, + entry->binding, + entry->array_element + j, + info->offset, info->range); + } + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + anv_descriptor_set_write_inline_uniform_data(device, set, + entry->binding, + data + entry->offset, + entry->array_element, + entry->array_count); + break; + + case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: + for (uint32_t j = 0; j < entry->array_count; j++) { + VkAccelerationStructureKHR *accel_obj = + (VkAccelerationStructureKHR *)(data + entry->offset + j * entry->stride); + ANV_FROM_HANDLE(anv_acceleration_structure, accel, *accel_obj); + + anv_descriptor_set_write_acceleration_structure(device, set, + accel, + entry->binding, + entry->array_element + j); + } + break; + + default: + break; + } + } +} + +VkResult anv_CreateDescriptorUpdateTemplate( + VkDevice _device, + const VkDescriptorUpdateTemplateCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDescriptorUpdateTemplate* pDescriptorUpdateTemplate) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_descriptor_update_template *template; + + size_t size = sizeof(*template) + + pCreateInfo->descriptorUpdateEntryCount * sizeof(template->entries[0]); + template = vk_object_alloc(&device->vk, pAllocator, size, + VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE); + if (template == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + template->bind_point = pCreateInfo->pipelineBindPoint; + + if (pCreateInfo->templateType == 
VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET) + template->set = pCreateInfo->set; + + template->entry_count = pCreateInfo->descriptorUpdateEntryCount; + for (uint32_t i = 0; i < template->entry_count; i++) { + const VkDescriptorUpdateTemplateEntry *pEntry = + &pCreateInfo->pDescriptorUpdateEntries[i]; + + template->entries[i] = (struct anv_descriptor_template_entry) { + .type = pEntry->descriptorType, + .binding = pEntry->dstBinding, + .array_element = pEntry->dstArrayElement, + .array_count = pEntry->descriptorCount, + .offset = pEntry->offset, + .stride = pEntry->stride, + }; + } + + *pDescriptorUpdateTemplate = + anv_descriptor_update_template_to_handle(template); + + return VK_SUCCESS; +} + +void anv_DestroyDescriptorUpdateTemplate( + VkDevice _device, + VkDescriptorUpdateTemplate descriptorUpdateTemplate, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_update_template, template, + descriptorUpdateTemplate); + + if (!template) + return; + + vk_object_free(&device->vk, pAllocator, template); +} + +void anv_UpdateDescriptorSetWithTemplate( + VkDevice _device, + VkDescriptorSet descriptorSet, + VkDescriptorUpdateTemplate descriptorUpdateTemplate, + const void* pData) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_set, set, descriptorSet); + ANV_FROM_HANDLE(anv_descriptor_update_template, template, + descriptorUpdateTemplate); + + anv_descriptor_set_write_template(device, set, NULL, template, pData); +} diff --git a/src/intel/vulkan_hasvk/anv_device.c b/src/intel/vulkan_hasvk/anv_device.c new file mode 100644 index 00000000000..5c833e9f8d3 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_device.c @@ -0,0 +1,4834 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#ifdef MAJOR_IN_MKDEV +#include +#endif +#ifdef MAJOR_IN_SYSMACROS +#include +#endif +#include +#include +#include +#include +#include "drm-uapi/drm_fourcc.h" +#include "drm-uapi/drm.h" +#include + +#include "anv_private.h" +#include "anv_measure.h" +#include "util/debug.h" +#include "util/build_id.h" +#include "util/disk_cache.h" +#include "util/mesa-sha1.h" +#include "util/os_file.h" +#include "util/os_misc.h" +#include "util/u_atomic.h" +#include "util/u_string.h" +#include "util/driconf.h" +#include "git_sha1.h" +#include "vk_util.h" +#include "vk_deferred_operation.h" +#include "vk_drm_syncobj.h" +#include "common/intel_aux_map.h" +#include "common/intel_defines.h" +#include "common/intel_uuid.h" +#include "perf/intel_perf.h" + +#include "genxml/gen7_pack.h" +#include "genxml/genX_bits.h" + +static const driOptionDescription anv_dri_options[] = { + DRI_CONF_SECTION_PERFORMANCE + DRI_CONF_ADAPTIVE_SYNC(true) + DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0) + DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false) + DRI_CONF_VK_XWAYLAND_WAIT_READY(true) + DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(false) + DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(false) + DRI_CONF_SECTION_END + + DRI_CONF_SECTION_DEBUG + DRI_CONF_ALWAYS_FLUSH_CACHE(false) + DRI_CONF_VK_WSI_FORCE_BGRA8_UNORM_FIRST(false) + DRI_CONF_LIMIT_TRIG_INPUT_RANGE(false) + DRI_CONF_SECTION_END +}; + +/* This is probably far to big but it reflects the max size used for messages + * in OpenGLs KHR_debug. + */ +#define MAX_DEBUG_MESSAGE_LENGTH 4096 + +/* Render engine timestamp register */ +#define TIMESTAMP 0x2358 + +/* The "RAW" clocks on Linux are called "FAST" on FreeBSD */ +#if !defined(CLOCK_MONOTONIC_RAW) && defined(CLOCK_MONOTONIC_FAST) +#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC_FAST +#endif + +static void +compiler_debug_log(void *data, UNUSED unsigned *id, const char *fmt, ...) +{ + char str[MAX_DEBUG_MESSAGE_LENGTH]; + struct anv_device *device = (struct anv_device *)data; + UNUSED struct anv_instance *instance = device->physical->instance; + + va_list args; + va_start(args, fmt); + (void) vsnprintf(str, MAX_DEBUG_MESSAGE_LENGTH, fmt, args); + va_end(args); + + //vk_logd(VK_LOG_NO_OBJS(&instance->vk), "%s", str); +} + +static void +compiler_perf_log(UNUSED void *data, UNUSED unsigned *id, const char *fmt, ...) 
+{ + va_list args; + va_start(args, fmt); + + if (INTEL_DEBUG(DEBUG_PERF)) + mesa_logd_v(fmt, args); + + va_end(args); +} + +#if defined(VK_USE_PLATFORM_WAYLAND_KHR) || \ + defined(VK_USE_PLATFORM_XCB_KHR) || \ + defined(VK_USE_PLATFORM_XLIB_KHR) || \ + defined(VK_USE_PLATFORM_DISPLAY_KHR) +#define ANV_USE_WSI_PLATFORM +#endif + +#ifdef ANDROID +#define ANV_API_VERSION VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION) +#else +#define ANV_API_VERSION VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION) +#endif + +VkResult anv_EnumerateInstanceVersion( + uint32_t* pApiVersion) +{ + *pApiVersion = ANV_API_VERSION; + return VK_SUCCESS; +} + +static const struct vk_instance_extension_table instance_extensions = { + .KHR_device_group_creation = true, + .KHR_external_fence_capabilities = true, + .KHR_external_memory_capabilities = true, + .KHR_external_semaphore_capabilities = true, + .KHR_get_physical_device_properties2 = true, + .EXT_debug_report = true, + .EXT_debug_utils = true, + +#ifdef ANV_USE_WSI_PLATFORM + .KHR_get_surface_capabilities2 = true, + .KHR_surface = true, + .KHR_surface_protected_capabilities = true, +#endif +#ifdef VK_USE_PLATFORM_WAYLAND_KHR + .KHR_wayland_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XCB_KHR + .KHR_xcb_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XLIB_KHR + .KHR_xlib_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT + .EXT_acquire_xlib_display = true, +#endif +#ifdef VK_USE_PLATFORM_DISPLAY_KHR + .KHR_display = true, + .KHR_get_display_properties2 = true, + .EXT_direct_mode_display = true, + .EXT_display_surface_counter = true, + .EXT_acquire_drm_display = true, +#endif +}; + +static void +get_device_extensions(const struct anv_physical_device *device, + struct vk_device_extension_table *ext) +{ + const bool has_syncobj_wait = + (device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT) != 0; + + const bool nv_mesh_shading_enabled = + env_var_as_boolean("ANV_EXPERIMENTAL_NV_MESH_SHADER", false); + + *ext = (struct vk_device_extension_table) { + .KHR_8bit_storage = device->info.ver >= 8, + .KHR_16bit_storage = device->info.ver >= 8, + .KHR_bind_memory2 = true, + .KHR_buffer_device_address = device->has_a64_buffer_access, + .KHR_copy_commands2 = true, + .KHR_create_renderpass2 = true, + .KHR_dedicated_allocation = true, + .KHR_deferred_host_operations = true, + .KHR_depth_stencil_resolve = true, + .KHR_descriptor_update_template = true, + .KHR_device_group = true, + .KHR_draw_indirect_count = true, + .KHR_driver_properties = true, + .KHR_dynamic_rendering = true, + .KHR_external_fence = has_syncobj_wait, + .KHR_external_fence_fd = has_syncobj_wait, + .KHR_external_memory = true, + .KHR_external_memory_fd = true, + .KHR_external_semaphore = true, + .KHR_external_semaphore_fd = true, + .KHR_format_feature_flags2 = true, + .KHR_fragment_shading_rate = device->info.ver >= 11, + .KHR_get_memory_requirements2 = true, + .KHR_image_format_list = true, + .KHR_imageless_framebuffer = true, +#ifdef ANV_USE_WSI_PLATFORM + .KHR_incremental_present = true, +#endif + .KHR_maintenance1 = true, + .KHR_maintenance2 = true, + .KHR_maintenance3 = true, + .KHR_maintenance4 = true, + .KHR_multiview = true, + .KHR_performance_query = + !anv_use_relocations(device) && device->perf && + (device->perf->i915_perf_version >= 3 || + INTEL_DEBUG(DEBUG_NO_OACONFIG)) && + device->use_call_secondary, + .KHR_pipeline_executable_properties = true, + .KHR_push_descriptor = true, + .KHR_ray_query = device->info.has_ray_tracing, + .KHR_relaxed_block_layout = true, + 
.KHR_sampler_mirror_clamp_to_edge = true, + .KHR_sampler_ycbcr_conversion = true, + .KHR_separate_depth_stencil_layouts = true, + .KHR_shader_atomic_int64 = device->info.ver >= 9, + .KHR_shader_clock = true, + .KHR_shader_draw_parameters = true, + .KHR_shader_float16_int8 = device->info.ver >= 8, + .KHR_shader_float_controls = device->info.ver >= 8, + .KHR_shader_integer_dot_product = true, + .KHR_shader_non_semantic_info = true, + .KHR_shader_subgroup_extended_types = device->info.ver >= 8, + .KHR_shader_subgroup_uniform_control_flow = true, + .KHR_shader_terminate_invocation = true, + .KHR_spirv_1_4 = true, + .KHR_storage_buffer_storage_class = true, +#ifdef ANV_USE_WSI_PLATFORM + .KHR_swapchain = true, + .KHR_swapchain_mutable_format = true, +#endif + .KHR_synchronization2 = true, + .KHR_timeline_semaphore = true, + .KHR_uniform_buffer_standard_layout = true, + .KHR_variable_pointers = true, + .KHR_vulkan_memory_model = true, + .KHR_workgroup_memory_explicit_layout = true, + .KHR_zero_initialize_workgroup_memory = true, + .EXT_4444_formats = true, + .EXT_border_color_swizzle = device->info.ver >= 8, + .EXT_buffer_device_address = device->has_a64_buffer_access, + .EXT_calibrated_timestamps = device->has_reg_timestamp, + .EXT_color_write_enable = true, + .EXT_conditional_rendering = device->info.verx10 >= 75, + .EXT_conservative_rasterization = device->info.ver >= 9, + .EXT_custom_border_color = device->info.ver >= 8, + .EXT_depth_clip_control = true, + .EXT_depth_clip_enable = true, + .EXT_descriptor_indexing = device->has_a64_buffer_access && + device->has_bindless_images, +#ifdef VK_USE_PLATFORM_DISPLAY_KHR + .EXT_display_control = true, +#endif + .EXT_extended_dynamic_state = true, + .EXT_extended_dynamic_state2 = true, + .EXT_external_memory_dma_buf = true, + .EXT_external_memory_host = true, + .EXT_fragment_shader_interlock = device->info.ver >= 9, + .EXT_global_priority = device->max_context_priority >= + INTEL_CONTEXT_MEDIUM_PRIORITY, + .EXT_global_priority_query = device->max_context_priority >= + INTEL_CONTEXT_MEDIUM_PRIORITY, + .EXT_host_query_reset = true, + .EXT_image_2d_view_of_3d = true, + .EXT_image_robustness = true, + .EXT_image_drm_format_modifier = true, + .EXT_image_view_min_lod = true, + .EXT_index_type_uint8 = true, + .EXT_inline_uniform_block = true, + .EXT_line_rasterization = true, + /* Enable the extension only if we have support on both the local & + * system memory + */ + .EXT_memory_budget = (!device->info.has_local_mem || + device->vram_mappable.available > 0) && + device->sys.available, + .EXT_non_seamless_cube_map = true, + .EXT_pci_bus_info = true, + .EXT_physical_device_drm = true, + .EXT_pipeline_creation_cache_control = true, + .EXT_pipeline_creation_feedback = true, + .EXT_post_depth_coverage = device->info.ver >= 9, + .EXT_primitives_generated_query = true, + .EXT_primitive_topology_list_restart = true, + .EXT_private_data = true, + .EXT_provoking_vertex = true, + .EXT_queue_family_foreign = true, + .EXT_robustness2 = true, + .EXT_sample_locations = true, + .EXT_sampler_filter_minmax = device->info.ver >= 9, + .EXT_scalar_block_layout = true, + .EXT_separate_stencil_usage = true, + .EXT_shader_atomic_float = true, + .EXT_shader_atomic_float2 = device->info.ver >= 9, + .EXT_shader_demote_to_helper_invocation = true, + .EXT_shader_module_identifier = true, + .EXT_shader_stencil_export = device->info.ver >= 9, + .EXT_shader_subgroup_ballot = true, + .EXT_shader_subgroup_vote = true, + .EXT_shader_viewport_index_layer = true, + .EXT_subgroup_size_control 
= true, + .EXT_texel_buffer_alignment = true, + .EXT_tooling_info = true, + .EXT_transform_feedback = true, + .EXT_vertex_attribute_divisor = true, + .EXT_ycbcr_image_arrays = true, +#ifdef ANDROID + .ANDROID_external_memory_android_hardware_buffer = true, + .ANDROID_native_buffer = true, +#endif + .GOOGLE_decorate_string = true, + .GOOGLE_hlsl_functionality1 = true, + .GOOGLE_user_type = true, + .INTEL_performance_query = device->perf && + device->perf->i915_perf_version >= 3, + .INTEL_shader_integer_functions2 = device->info.ver >= 8, + .EXT_multi_draw = true, + .NV_compute_shader_derivatives = true, + .NV_mesh_shader = device->info.has_mesh_shading && + nv_mesh_shading_enabled, + .VALVE_mutable_descriptor_type = true, + }; +} + +static uint64_t +anv_compute_sys_heap_size(struct anv_physical_device *device, + uint64_t total_ram) +{ + /* We don't want to burn too much ram with the GPU. If the user has 4GiB + * or less, we use at most half. If they have more than 4GiB, we use 3/4. + */ + uint64_t available_ram; + if (total_ram <= 4ull * 1024ull * 1024ull * 1024ull) + available_ram = total_ram / 2; + else + available_ram = total_ram * 3 / 4; + + /* We also want to leave some padding for things we allocate in the driver, + * so don't go over 3/4 of the GTT either. + */ + available_ram = MIN2(available_ram, device->gtt_size * 3 / 4); + + if (available_ram > (2ull << 30) && !device->supports_48bit_addresses) { + /* When running with an overridden PCI ID, we may get a GTT size from + * the kernel that is greater than 2 GiB but the execbuf check for 48bit + * address support can still fail. Just clamp the address space size to + * 2 GiB if we don't have 48-bit support. + */ + mesa_logw("%s:%d: The kernel reported a GTT size larger than 2 GiB but " + "not support for 48-bit addresses", + __FILE__, __LINE__); + available_ram = 2ull << 30; + } + + return available_ram; +} + +static VkResult MUST_CHECK +anv_init_meminfo(struct anv_physical_device *device, int fd) +{ + const struct intel_device_info *devinfo = &device->info; + + device->sys.region.memory_class = devinfo->mem.sram.mem_class; + device->sys.region.memory_instance = devinfo->mem.sram.mem_instance; + device->sys.size = + anv_compute_sys_heap_size(device, devinfo->mem.sram.mappable.size); + device->sys.available = devinfo->mem.sram.mappable.free; + + device->vram_mappable.region.memory_class = devinfo->mem.vram.mem_class; + device->vram_mappable.region.memory_instance = + devinfo->mem.vram.mem_instance; + device->vram_mappable.size = devinfo->mem.vram.mappable.size; + device->vram_mappable.available = devinfo->mem.vram.mappable.free; + + device->vram_non_mappable.region.memory_class = + devinfo->mem.vram.mem_class; + device->vram_non_mappable.region.memory_instance = + devinfo->mem.vram.mem_instance; + device->vram_non_mappable.size = devinfo->mem.vram.unmappable.size; + device->vram_non_mappable.available = devinfo->mem.vram.unmappable.free; + + return VK_SUCCESS; +} + +static void +anv_update_meminfo(struct anv_physical_device *device, int fd) +{ + if (!intel_device_info_update_memory_info(&device->info, fd)) + return; + + const struct intel_device_info *devinfo = &device->info; + device->sys.available = devinfo->mem.sram.mappable.free; + device->vram_mappable.available = devinfo->mem.vram.mappable.free; + device->vram_non_mappable.available = devinfo->mem.vram.unmappable.free; +} + + +static VkResult +anv_physical_device_init_heaps(struct anv_physical_device *device, int fd) +{ + VkResult result = anv_init_meminfo(device, fd); + if 
(result != VK_SUCCESS) + return result; + + assert(device->sys.size != 0); + + if (anv_physical_device_has_vram(device)) { + /* We can create 2 or 3 different heaps when we have local memory + * support, first heap with local memory size and second with system + * memory size and the third is added only if part of the vram is + * mappable to the host. + */ + device->memory.heap_count = 2; + device->memory.heaps[0] = (struct anv_memory_heap) { + /* If there is a vram_non_mappable, use that for the device only + * heap. Otherwise use the vram_mappable. + */ + .size = device->vram_non_mappable.size != 0 ? + device->vram_non_mappable.size : device->vram_mappable.size, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + .is_local_mem = true, + }; + device->memory.heaps[1] = (struct anv_memory_heap) { + .size = device->sys.size, + .flags = 0, + .is_local_mem = false, + }; + /* Add an additional smaller vram mappable heap if we can't map all the + * vram to the host. + */ + if (device->vram_non_mappable.size > 0) { + device->memory.heap_count++; + device->memory.heaps[2] = (struct anv_memory_heap) { + .size = device->vram_mappable.size, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + .is_local_mem = true, + }; + } + + device->memory.type_count = 3; + device->memory.types[0] = (struct anv_memory_type) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + .heapIndex = 0, + }; + device->memory.types[1] = (struct anv_memory_type) { + .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT, + .heapIndex = 1, + }; + device->memory.types[2] = (struct anv_memory_type) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + /* This memory type either comes from heaps[0] if there is only + * mappable vram region, or from heaps[2] if there is both mappable & + * non-mappable vram regions. + */ + .heapIndex = device->vram_non_mappable.size > 0 ? 2 : 0, + }; + } else if (device->info.has_llc) { + device->memory.heap_count = 1; + device->memory.heaps[0] = (struct anv_memory_heap) { + .size = device->sys.size, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + .is_local_mem = false, + }; + + /* Big core GPUs share LLC with the CPU and thus one memory type can be + * both cached and coherent at the same time. + */ + device->memory.type_count = 1; + device->memory.types[0] = (struct anv_memory_type) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT, + .heapIndex = 0, + }; + } else { + device->memory.heap_count = 1; + device->memory.heaps[0] = (struct anv_memory_heap) { + .size = device->sys.size, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + .is_local_mem = false, + }; + + /* The spec requires that we expose a host-visible, coherent memory + * type, but Atom GPUs don't share LLC. Thus we offer two memory types + * to give the application a choice between cached, but not coherent and + * coherent but uncached (WC though). 
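+    * Concretely, type 0 below is HOST_CACHED but not HOST_COHERENT (so it
+    * relies on the clflush handling set up further down), while type 1 is
+    * HOST_COHERENT but not HOST_CACHED.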
+ */ + device->memory.type_count = 2; + device->memory.types[0] = (struct anv_memory_type) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT, + .heapIndex = 0, + }; + device->memory.types[1] = (struct anv_memory_type) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + .heapIndex = 0, + }; + } + + device->memory.need_clflush = false; + for (unsigned i = 0; i < device->memory.type_count; i++) { + VkMemoryPropertyFlags props = device->memory.types[i].propertyFlags; + if ((props & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) && + !(props & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) + device->memory.need_clflush = true; + } + + return VK_SUCCESS; +} + +static VkResult +anv_physical_device_init_uuids(struct anv_physical_device *device) +{ + const struct build_id_note *note = + build_id_find_nhdr_for_addr(anv_physical_device_init_uuids); + if (!note) { + return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "Failed to find build-id"); + } + + unsigned build_id_len = build_id_length(note); + if (build_id_len < 20) { + return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "build-id too short. It needs to be a SHA"); + } + + memcpy(device->driver_build_sha1, build_id_data(note), 20); + + struct mesa_sha1 sha1_ctx; + uint8_t sha1[20]; + STATIC_ASSERT(VK_UUID_SIZE <= sizeof(sha1)); + + /* The pipeline cache UUID is used for determining when a pipeline cache is + * invalid. It needs both a driver build and the PCI ID of the device. + */ + _mesa_sha1_init(&sha1_ctx); + _mesa_sha1_update(&sha1_ctx, build_id_data(note), build_id_len); + _mesa_sha1_update(&sha1_ctx, &device->info.pci_device_id, + sizeof(device->info.pci_device_id)); + _mesa_sha1_update(&sha1_ctx, &device->always_use_bindless, + sizeof(device->always_use_bindless)); + _mesa_sha1_update(&sha1_ctx, &device->has_a64_buffer_access, + sizeof(device->has_a64_buffer_access)); + _mesa_sha1_update(&sha1_ctx, &device->has_bindless_images, + sizeof(device->has_bindless_images)); + _mesa_sha1_update(&sha1_ctx, &device->has_bindless_samplers, + sizeof(device->has_bindless_samplers)); + _mesa_sha1_final(&sha1_ctx, sha1); + memcpy(device->pipeline_cache_uuid, sha1, VK_UUID_SIZE); + + intel_uuid_compute_driver_id(device->driver_uuid, &device->info, VK_UUID_SIZE); + intel_uuid_compute_device_id(device->device_uuid, &device->info, VK_UUID_SIZE); + + return VK_SUCCESS; +} + +static void +anv_physical_device_init_disk_cache(struct anv_physical_device *device) +{ +#ifdef ENABLE_SHADER_CACHE + char renderer[10]; + ASSERTED int len = snprintf(renderer, sizeof(renderer), "anv_%04x", + device->info.pci_device_id); + assert(len == sizeof(renderer) - 2); + + char timestamp[41]; + _mesa_sha1_format(timestamp, device->driver_build_sha1); + + const uint64_t driver_flags = + brw_get_compiler_config_value(device->compiler); + device->vk.disk_cache = disk_cache_create(renderer, timestamp, driver_flags); +#endif +} + +static void +anv_physical_device_free_disk_cache(struct anv_physical_device *device) +{ +#ifdef ENABLE_SHADER_CACHE + if (device->vk.disk_cache) { + disk_cache_destroy(device->vk.disk_cache); + device->vk.disk_cache = NULL; + } +#else + assert(device->vk.disk_cache == NULL); +#endif +} + +/* The ANV_QUEUE_OVERRIDE environment variable is a comma separated list of + * queue overrides. 
+ * + * To override the number queues: + * * "gc" is for graphics queues with compute support + * * "g" is for graphics queues with no compute support + * * "c" is for compute queues with no graphics support + * + * For example, ANV_QUEUE_OVERRIDE=gc=2,c=1 would override the number of + * advertised queues to be 2 queues with graphics+compute support, and 1 queue + * with compute-only support. + * + * ANV_QUEUE_OVERRIDE=c=1 would override the number of advertised queues to + * include 1 queue with compute-only support, but it will not change the + * number of graphics+compute queues. + * + * ANV_QUEUE_OVERRIDE=gc=0,c=1 would override the number of advertised queues + * to include 1 queue with compute-only support, and it would override the + * number of graphics+compute queues to be 0. + */ +static void +anv_override_engine_counts(int *gc_count, int *g_count, int *c_count) +{ + int gc_override = -1; + int g_override = -1; + int c_override = -1; + char *env = getenv("ANV_QUEUE_OVERRIDE"); + + if (env == NULL) + return; + + env = strdup(env); + char *save = NULL; + char *next = strtok_r(env, ",", &save); + while (next != NULL) { + if (strncmp(next, "gc=", 3) == 0) { + gc_override = strtol(next + 3, NULL, 0); + } else if (strncmp(next, "g=", 2) == 0) { + g_override = strtol(next + 2, NULL, 0); + } else if (strncmp(next, "c=", 2) == 0) { + c_override = strtol(next + 2, NULL, 0); + } else { + mesa_logw("Ignoring unsupported ANV_QUEUE_OVERRIDE token: %s", next); + } + next = strtok_r(NULL, ",", &save); + } + free(env); + if (gc_override >= 0) + *gc_count = gc_override; + if (g_override >= 0) + *g_count = g_override; + if (*g_count > 0 && *gc_count <= 0 && (gc_override >= 0 || g_override >= 0)) + mesa_logw("ANV_QUEUE_OVERRIDE: gc=0 with g > 0 violates the " + "Vulkan specification"); + if (c_override >= 0) + *c_count = c_override; +} + +static void +anv_physical_device_init_queue_families(struct anv_physical_device *pdevice) +{ + uint32_t family_count = 0; + + if (pdevice->engine_info) { + int gc_count = + intel_gem_count_engines(pdevice->engine_info, + I915_ENGINE_CLASS_RENDER); + int g_count = 0; + int c_count = 0; + if (env_var_as_boolean("INTEL_COMPUTE_CLASS", false)) + c_count = intel_gem_count_engines(pdevice->engine_info, + I915_ENGINE_CLASS_COMPUTE); + enum drm_i915_gem_engine_class compute_class = + c_count < 1 ? I915_ENGINE_CLASS_RENDER : I915_ENGINE_CLASS_COMPUTE; + + anv_override_engine_counts(&gc_count, &g_count, &c_count); + + if (gc_count > 0) { + pdevice->queue.families[family_count++] = (struct anv_queue_family) { + .queueFlags = VK_QUEUE_GRAPHICS_BIT | + VK_QUEUE_COMPUTE_BIT | + VK_QUEUE_TRANSFER_BIT, + .queueCount = gc_count, + .engine_class = I915_ENGINE_CLASS_RENDER, + }; + } + if (g_count > 0) { + pdevice->queue.families[family_count++] = (struct anv_queue_family) { + .queueFlags = VK_QUEUE_GRAPHICS_BIT | + VK_QUEUE_TRANSFER_BIT, + .queueCount = g_count, + .engine_class = I915_ENGINE_CLASS_RENDER, + }; + } + if (c_count > 0) { + pdevice->queue.families[family_count++] = (struct anv_queue_family) { + .queueFlags = VK_QUEUE_COMPUTE_BIT | + VK_QUEUE_TRANSFER_BIT, + .queueCount = c_count, + .engine_class = compute_class, + }; + } + /* Increase count below when other families are added as a reminder to + * increase the ANV_MAX_QUEUE_FAMILIES value. 
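+       * (At most three families are exposed here: gc, g and c, which is
+       * what the assert below checks.)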
+ */ + STATIC_ASSERT(ANV_MAX_QUEUE_FAMILIES >= 3); + } else { + /* Default to a single render queue */ + pdevice->queue.families[family_count++] = (struct anv_queue_family) { + .queueFlags = VK_QUEUE_GRAPHICS_BIT | + VK_QUEUE_COMPUTE_BIT | + VK_QUEUE_TRANSFER_BIT, + .queueCount = 1, + .engine_class = I915_ENGINE_CLASS_RENDER, + }; + family_count = 1; + } + assert(family_count <= ANV_MAX_QUEUE_FAMILIES); + pdevice->queue.family_count = family_count; +} + +static VkResult +anv_physical_device_try_create(struct vk_instance *vk_instance, + struct _drmDevice *drm_device, + struct vk_physical_device **out) +{ + struct anv_instance *instance = + container_of(vk_instance, struct anv_instance, vk); + + if (!(drm_device->available_nodes & (1 << DRM_NODE_RENDER)) || + drm_device->bustype != DRM_BUS_PCI || + drm_device->deviceinfo.pci->vendor_id != 0x8086) + return VK_ERROR_INCOMPATIBLE_DRIVER; + + const char *primary_path = drm_device->nodes[DRM_NODE_PRIMARY]; + const char *path = drm_device->nodes[DRM_NODE_RENDER]; + VkResult result; + int fd; + int master_fd = -1; + + brw_process_intel_debug_variable(); + + fd = open(path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + if (errno == ENOMEM) { + return vk_errorf(instance, VK_ERROR_OUT_OF_HOST_MEMORY, + "Unable to open device %s: out of memory", path); + } + return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "Unable to open device %s: %m", path); + } + + struct intel_device_info devinfo; + if (!intel_get_device_info_from_fd(fd, &devinfo)) { + result = vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER); + goto fail_fd; + } + + bool is_alpha = true; + if (devinfo.platform == INTEL_PLATFORM_HSW) { + mesa_logw("Haswell Vulkan support is incomplete"); + } else if (devinfo.platform == INTEL_PLATFORM_IVB) { + mesa_logw("Ivy Bridge Vulkan support is incomplete"); + } else if (devinfo.platform == INTEL_PLATFORM_BYT) { + mesa_logw("Bay Trail Vulkan support is incomplete"); + } else if (devinfo.ver == 8) { + /* Gfx8 fully supported */ + is_alpha = false; + } else { + result = vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "Vulkan not yet supported on %s", devinfo.name); + goto fail_fd; + } + + struct anv_physical_device *device = + vk_zalloc(&instance->vk.alloc, sizeof(*device), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (device == NULL) { + result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_fd; + } + + struct vk_physical_device_dispatch_table dispatch_table; + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &anv_physical_device_entrypoints, true); + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_physical_device_entrypoints, false); + + result = vk_physical_device_init(&device->vk, &instance->vk, + NULL, /* We set up extensions later */ + &dispatch_table); + if (result != VK_SUCCESS) { + vk_error(instance, result); + goto fail_alloc; + } + device->instance = instance; + + assert(strlen(path) < ARRAY_SIZE(device->path)); + snprintf(device->path, ARRAY_SIZE(device->path), "%s", path); + + device->info = devinfo; + device->is_alpha = is_alpha; + + device->cmd_parser_version = -1; + if (device->info.ver == 7) { + device->cmd_parser_version = + anv_gem_get_param(fd, I915_PARAM_CMD_PARSER_VERSION); + if (device->cmd_parser_version == -1) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "failed to get command parser version"); + goto fail_base; + } + } + + if (!anv_gem_get_param(fd, I915_PARAM_HAS_WAIT_TIMEOUT)) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + 
"kernel missing gem wait"); + goto fail_base; + } + + if (!anv_gem_get_param(fd, I915_PARAM_HAS_EXECBUF2)) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing execbuf2"); + goto fail_base; + } + + if (!device->info.has_llc && + anv_gem_get_param(fd, I915_PARAM_MMAP_VERSION) < 1) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing wc mmap"); + goto fail_base; + } + + device->use_relocations = device->info.ver < 8 || + device->info.platform == INTEL_PLATFORM_CHV; + + if (!device->use_relocations && + !anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN)) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing softpin"); + goto fail_alloc; + } + + if (!anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_FENCE_ARRAY)) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing syncobj support"); + goto fail_base; + } + + device->has_exec_async = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC); + device->has_exec_capture = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE); + + /* Start with medium; sorted low to high */ + const int priorities[] = { + INTEL_CONTEXT_MEDIUM_PRIORITY, + INTEL_CONTEXT_HIGH_PRIORITY, + INTEL_CONTEXT_REALTIME_PRIORITY, + }; + device->max_context_priority = INT_MIN; + for (unsigned i = 0; i < ARRAY_SIZE(priorities); i++) { + if (!anv_gem_has_context_priority(fd, priorities[i])) + break; + device->max_context_priority = priorities[i]; + } + + device->gtt_size = device->info.gtt_size ? device->info.gtt_size : + device->info.aperture_bytes; + + /* We only allow 48-bit addresses with softpin because knowing the actual + * address is required for the vertex cache flush workaround. + */ + device->supports_48bit_addresses = (device->info.ver >= 8) && + device->gtt_size > (4ULL << 30 /* GiB */); + + result = anv_physical_device_init_heaps(device, fd); + if (result != VK_SUCCESS) + goto fail_base; + + assert(device->supports_48bit_addresses == !device->use_relocations); + device->use_softpin = !device->use_relocations; + + device->has_context_isolation = + anv_gem_get_param(fd, I915_PARAM_HAS_CONTEXT_ISOLATION); + + device->has_exec_timeline = + anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_TIMELINE_FENCES); + if (env_var_as_boolean("ANV_QUEUE_THREAD_DISABLE", false)) + device->has_exec_timeline = false; + + unsigned st_idx = 0; + + device->sync_syncobj_type = vk_drm_syncobj_get_type(fd); + if (!device->has_exec_timeline) + device->sync_syncobj_type.features &= ~VK_SYNC_FEATURE_TIMELINE; + device->sync_types[st_idx++] = &device->sync_syncobj_type; + + if (!(device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT)) + device->sync_types[st_idx++] = &anv_bo_sync_type; + + if (!(device->sync_syncobj_type.features & VK_SYNC_FEATURE_TIMELINE)) { + device->sync_timeline_type = vk_sync_timeline_get_type(&anv_bo_sync_type); + device->sync_types[st_idx++] = &device->sync_timeline_type.sync; + } + + device->sync_types[st_idx++] = NULL; + assert(st_idx <= ARRAY_SIZE(device->sync_types)); + device->vk.supported_sync_types = device->sync_types; + + device->vk.pipeline_cache_import_ops = anv_cache_import_ops; + + device->always_use_bindless = + env_var_as_boolean("ANV_ALWAYS_BINDLESS", false); + + device->use_call_secondary = + device->use_softpin && + !env_var_as_boolean("ANV_DISABLE_SECONDARY_CMD_BUFFER_CALLS", false); + + /* We first got the A64 messages on broadwell and we can only use them if + * we can pass addresses directly into the shader which requires softpin. 
+ */ + device->has_a64_buffer_access = device->info.ver >= 8 && + device->use_softpin; + + /* We first get bindless image access on Skylake. + */ + device->has_bindless_images = device->info.ver >= 9; + + /* We've had bindless samplers since Ivy Bridge (forever in Vulkan terms) + * because it's just a matter of setting the sampler address in the sample + * message header. However, we've not bothered to wire it up for vec4 so + * we leave it disabled on gfx7. + */ + device->has_bindless_samplers = device->info.ver >= 8; + + device->has_implicit_ccs = device->info.has_aux_map || + device->info.verx10 >= 125; + + /* Check if we can read the GPU timestamp register from the CPU */ + uint64_t u64_ignore; + device->has_reg_timestamp = anv_gem_reg_read(fd, TIMESTAMP | I915_REG_READ_8B_WA, + &u64_ignore) == 0; + + device->always_flush_cache = INTEL_DEBUG(DEBUG_STALL) || + driQueryOptionb(&instance->dri_options, "always_flush_cache"); + + device->has_mmap_offset = + anv_gem_get_param(fd, I915_PARAM_MMAP_GTT_VERSION) >= 4; + + device->has_userptr_probe = + anv_gem_get_param(fd, I915_PARAM_HAS_USERPTR_PROBE); + + device->compiler = brw_compiler_create(NULL, &device->info); + if (device->compiler == NULL) { + result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_base; + } + device->compiler->shader_debug_log = compiler_debug_log; + device->compiler->shader_perf_log = compiler_perf_log; + device->compiler->constant_buffer_0_is_relative = + device->info.ver < 8 || !device->has_context_isolation; + device->compiler->supports_shader_constants = true; + device->compiler->indirect_ubos_use_sampler = device->info.ver < 12; + + isl_device_init(&device->isl_dev, &device->info); + + result = anv_physical_device_init_uuids(device); + if (result != VK_SUCCESS) + goto fail_compiler; + + anv_physical_device_init_disk_cache(device); + + if (instance->vk.enabled_extensions.KHR_display) { + master_fd = open(primary_path, O_RDWR | O_CLOEXEC); + if (master_fd >= 0) { + /* prod the device with a GETPARAM call which will fail if + * we don't have permission to even render on this device + */ + if (anv_gem_get_param(master_fd, I915_PARAM_CHIPSET_ID) == 0) { + close(master_fd); + master_fd = -1; + } + } + } + device->master_fd = master_fd; + + device->engine_info = anv_gem_get_engine_info(fd); + anv_physical_device_init_queue_families(device); + + device->local_fd = fd; + + anv_physical_device_init_perf(device, fd); + + get_device_extensions(device, &device->vk.supported_extensions); + + result = anv_init_wsi(device); + if (result != VK_SUCCESS) + goto fail_perf; + + anv_measure_device_init(device); + + anv_genX(&device->info, init_physical_device_state)(device); + + *out = &device->vk; + + struct stat st; + + if (stat(primary_path, &st) == 0) { + device->has_master = true; + device->master_major = major(st.st_rdev); + device->master_minor = minor(st.st_rdev); + } else { + device->has_master = false; + device->master_major = 0; + device->master_minor = 0; + } + + if (stat(path, &st) == 0) { + device->has_local = true; + device->local_major = major(st.st_rdev); + device->local_minor = minor(st.st_rdev); + } else { + device->has_local = false; + device->local_major = 0; + device->local_minor = 0; + } + + return VK_SUCCESS; + +fail_perf: + ralloc_free(device->perf); + free(device->engine_info); + anv_physical_device_free_disk_cache(device); +fail_compiler: + ralloc_free(device->compiler); +fail_base: + vk_physical_device_finish(&device->vk); +fail_alloc: + vk_free(&instance->vk.alloc, device); +fail_fd: + 
close(fd); + if (master_fd != -1) + close(master_fd); + return result; +} + +static void +anv_physical_device_destroy(struct vk_physical_device *vk_device) +{ + struct anv_physical_device *device = + container_of(vk_device, struct anv_physical_device, vk); + + anv_finish_wsi(device); + anv_measure_device_destroy(device); + free(device->engine_info); + anv_physical_device_free_disk_cache(device); + ralloc_free(device->compiler); + ralloc_free(device->perf); + close(device->local_fd); + if (device->master_fd >= 0) + close(device->master_fd); + vk_physical_device_finish(&device->vk); + vk_free(&device->instance->vk.alloc, device); +} + +VkResult anv_EnumerateInstanceExtensionProperties( + const char* pLayerName, + uint32_t* pPropertyCount, + VkExtensionProperties* pProperties) +{ + if (pLayerName) + return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT); + + return vk_enumerate_instance_extension_properties( + &instance_extensions, pPropertyCount, pProperties); +} + +static void +anv_init_dri_options(struct anv_instance *instance) +{ + driParseOptionInfo(&instance->available_dri_options, anv_dri_options, + ARRAY_SIZE(anv_dri_options)); + driParseConfigFiles(&instance->dri_options, + &instance->available_dri_options, 0, "anv", NULL, NULL, + instance->vk.app_info.app_name, + instance->vk.app_info.app_version, + instance->vk.app_info.engine_name, + instance->vk.app_info.engine_version); + + instance->assume_full_subgroups = + driQueryOptionb(&instance->dri_options, "anv_assume_full_subgroups"); + instance->limit_trig_input_range = + driQueryOptionb(&instance->dri_options, "limit_trig_input_range"); + instance->sample_mask_out_opengl_behaviour = + driQueryOptionb(&instance->dri_options, "anv_sample_mask_out_opengl_behaviour"); +} + +VkResult anv_CreateInstance( + const VkInstanceCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkInstance* pInstance) +{ + struct anv_instance *instance; + VkResult result; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO); + + if (pAllocator == NULL) + pAllocator = vk_default_allocator(); + + instance = vk_alloc(pAllocator, sizeof(*instance), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!instance) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct vk_instance_dispatch_table dispatch_table; + vk_instance_dispatch_table_from_entrypoints( + &dispatch_table, &anv_instance_entrypoints, true); + vk_instance_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_instance_entrypoints, false); + + result = vk_instance_init(&instance->vk, &instance_extensions, + &dispatch_table, pCreateInfo, pAllocator); + if (result != VK_SUCCESS) { + vk_free(pAllocator, instance); + return vk_error(NULL, result); + } + + instance->vk.physical_devices.try_create_for_drm = anv_physical_device_try_create; + instance->vk.physical_devices.destroy = anv_physical_device_destroy; + + VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false)); + + anv_init_dri_options(instance); + + intel_driver_ds_init(); + + *pInstance = anv_instance_to_handle(instance); + + return VK_SUCCESS; +} + +void anv_DestroyInstance( + VkInstance _instance, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_instance, instance, _instance); + + if (!instance) + return; + + VG(VALGRIND_DESTROY_MEMPOOL(instance)); + + driDestroyOptionCache(&instance->dri_options); + driDestroyOptionInfo(&instance->available_dri_options); + + vk_instance_finish(&instance->vk); + vk_free(&instance->vk.alloc, instance); +} + +void anv_GetPhysicalDeviceFeatures( + VkPhysicalDevice 
physicalDevice, + VkPhysicalDeviceFeatures* pFeatures) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + + /* Just pick one; they're all the same */ + const bool has_astc_ldr = + isl_format_supports_sampling(&pdevice->info, + ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16); + + *pFeatures = (VkPhysicalDeviceFeatures) { + .robustBufferAccess = true, + .fullDrawIndexUint32 = true, + .imageCubeArray = true, + .independentBlend = true, + .geometryShader = true, + .tessellationShader = true, + .sampleRateShading = true, + .dualSrcBlend = true, + .logicOp = true, + .multiDrawIndirect = true, + .drawIndirectFirstInstance = true, + .depthClamp = true, + .depthBiasClamp = true, + .fillModeNonSolid = true, + .depthBounds = pdevice->info.ver >= 12, + .wideLines = true, + .largePoints = true, + .alphaToOne = true, + .multiViewport = true, + .samplerAnisotropy = true, + .textureCompressionETC2 = pdevice->info.ver >= 8 || + pdevice->info.platform == INTEL_PLATFORM_BYT, + .textureCompressionASTC_LDR = has_astc_ldr, + .textureCompressionBC = true, + .occlusionQueryPrecise = true, + .pipelineStatisticsQuery = true, + .fragmentStoresAndAtomics = true, + .shaderTessellationAndGeometryPointSize = true, + .shaderImageGatherExtended = true, + .shaderStorageImageExtendedFormats = true, + .shaderStorageImageMultisample = false, + .shaderStorageImageReadWithoutFormat = false, + .shaderStorageImageWriteWithoutFormat = true, + .shaderUniformBufferArrayDynamicIndexing = true, + .shaderSampledImageArrayDynamicIndexing = true, + .shaderStorageBufferArrayDynamicIndexing = true, + .shaderStorageImageArrayDynamicIndexing = true, + .shaderClipDistance = true, + .shaderCullDistance = true, + .shaderFloat64 = pdevice->info.ver >= 8 && + pdevice->info.has_64bit_float, + .shaderInt64 = pdevice->info.ver >= 8, + .shaderInt16 = pdevice->info.ver >= 8, + .shaderResourceMinLod = pdevice->info.ver >= 9, + .variableMultisampleRate = true, + .inheritedQueries = true, + }; + + /* We can't do image stores in vec4 shaders */ + pFeatures->vertexPipelineStoresAndAtomics = + pdevice->compiler->scalar_stage[MESA_SHADER_VERTEX] && + pdevice->compiler->scalar_stage[MESA_SHADER_GEOMETRY]; + + struct vk_app_info *app_info = &pdevice->instance->vk.app_info; + + /* The new DOOM and Wolfenstein games require depthBounds without + * checking for it. They seem to run fine without it so just claim it's + * there and accept the consequences. 
+ */ + if (app_info->engine_name && strcmp(app_info->engine_name, "idTech") == 0) + pFeatures->depthBounds = true; +} + +static void +anv_get_physical_device_features_1_1(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan11Features *f) +{ + assert(f->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES); + + f->storageBuffer16BitAccess = pdevice->info.ver >= 8; + f->uniformAndStorageBuffer16BitAccess = pdevice->info.ver >= 8; + f->storagePushConstant16 = pdevice->info.ver >= 8; + f->storageInputOutput16 = false; + f->multiview = true; + f->multiviewGeometryShader = true; + f->multiviewTessellationShader = true; + f->variablePointersStorageBuffer = true; + f->variablePointers = true; + f->protectedMemory = false; + f->samplerYcbcrConversion = true; + f->shaderDrawParameters = true; +} + +static void +anv_get_physical_device_features_1_2(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan12Features *f) +{ + assert(f->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES); + + f->samplerMirrorClampToEdge = true; + f->drawIndirectCount = true; + f->storageBuffer8BitAccess = pdevice->info.ver >= 8; + f->uniformAndStorageBuffer8BitAccess = pdevice->info.ver >= 8; + f->storagePushConstant8 = pdevice->info.ver >= 8; + f->shaderBufferInt64Atomics = pdevice->info.ver >= 9; + f->shaderSharedInt64Atomics = false; + f->shaderFloat16 = pdevice->info.ver >= 8; + f->shaderInt8 = pdevice->info.ver >= 8; + + bool descIndexing = pdevice->has_a64_buffer_access && + pdevice->has_bindless_images; + f->descriptorIndexing = descIndexing; + f->shaderInputAttachmentArrayDynamicIndexing = false; + f->shaderUniformTexelBufferArrayDynamicIndexing = descIndexing; + f->shaderStorageTexelBufferArrayDynamicIndexing = descIndexing; + f->shaderUniformBufferArrayNonUniformIndexing = false; + f->shaderSampledImageArrayNonUniformIndexing = descIndexing; + f->shaderStorageBufferArrayNonUniformIndexing = descIndexing; + f->shaderStorageImageArrayNonUniformIndexing = descIndexing; + f->shaderInputAttachmentArrayNonUniformIndexing = false; + f->shaderUniformTexelBufferArrayNonUniformIndexing = descIndexing; + f->shaderStorageTexelBufferArrayNonUniformIndexing = descIndexing; + f->descriptorBindingUniformBufferUpdateAfterBind = descIndexing; + f->descriptorBindingSampledImageUpdateAfterBind = descIndexing; + f->descriptorBindingStorageImageUpdateAfterBind = descIndexing; + f->descriptorBindingStorageBufferUpdateAfterBind = descIndexing; + f->descriptorBindingUniformTexelBufferUpdateAfterBind = descIndexing; + f->descriptorBindingStorageTexelBufferUpdateAfterBind = descIndexing; + f->descriptorBindingUpdateUnusedWhilePending = descIndexing; + f->descriptorBindingPartiallyBound = descIndexing; + f->descriptorBindingVariableDescriptorCount = descIndexing; + f->runtimeDescriptorArray = descIndexing; + + f->samplerFilterMinmax = pdevice->info.ver >= 9; + f->scalarBlockLayout = true; + f->imagelessFramebuffer = true; + f->uniformBufferStandardLayout = true; + f->shaderSubgroupExtendedTypes = true; + f->separateDepthStencilLayouts = true; + f->hostQueryReset = true; + f->timelineSemaphore = true; + f->bufferDeviceAddress = pdevice->has_a64_buffer_access; + f->bufferDeviceAddressCaptureReplay = pdevice->has_a64_buffer_access; + f->bufferDeviceAddressMultiDevice = false; + f->vulkanMemoryModel = true; + f->vulkanMemoryModelDeviceScope = true; + f->vulkanMemoryModelAvailabilityVisibilityChains = true; + f->shaderOutputViewportIndex = true; + f->shaderOutputLayer = true; + f->subgroupBroadcastDynamicId 
= true; +} + +static void +anv_get_physical_device_features_1_3(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan13Features *f) +{ + assert(f->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES); + + f->robustImageAccess = true; + f->inlineUniformBlock = true; + f->descriptorBindingInlineUniformBlockUpdateAfterBind = true; + f->pipelineCreationCacheControl = true; + f->privateData = true; + f->shaderDemoteToHelperInvocation = true; + f->shaderTerminateInvocation = true; + f->subgroupSizeControl = true; + f->computeFullSubgroups = true; + f->synchronization2 = true; + f->textureCompressionASTC_HDR = false; + f->shaderZeroInitializeWorkgroupMemory = true; + f->dynamicRendering = true; + f->shaderIntegerDotProduct = true; + f->maintenance4 = true; +} + +void anv_GetPhysicalDeviceFeatures2( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceFeatures2* pFeatures) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + anv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features); + + VkPhysicalDeviceVulkan11Features core_1_1 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES, + }; + anv_get_physical_device_features_1_1(pdevice, &core_1_1); + + VkPhysicalDeviceVulkan12Features core_1_2 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, + }; + anv_get_physical_device_features_1_2(pdevice, &core_1_2); + + VkPhysicalDeviceVulkan13Features core_1_3 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES, + }; + anv_get_physical_device_features_1_3(pdevice, &core_1_3); + + vk_foreach_struct(ext, pFeatures->pNext) { + if (vk_get_physical_device_core_1_1_feature_ext(ext, &core_1_1)) + continue; + if (vk_get_physical_device_core_1_2_feature_ext(ext, &core_1_2)) + continue; + if (vk_get_physical_device_core_1_3_feature_ext(ext, &core_1_3)) + continue; + + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_4444_FORMATS_FEATURES_EXT: { + VkPhysicalDevice4444FormatsFeaturesEXT *features = + (VkPhysicalDevice4444FormatsFeaturesEXT *)ext; + features->formatA4R4G4B4 = true; + features->formatA4B4G4R4 = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR: { + VkPhysicalDeviceAccelerationStructureFeaturesKHR *features = (void *)ext; + features->accelerationStructure = false; + features->accelerationStructureCaptureReplay = false; + features->accelerationStructureIndirectBuild = false; + features->accelerationStructureHostCommands = false; + features->descriptorBindingAccelerationStructureUpdateAfterBind = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_EXT: { + VkPhysicalDeviceBufferDeviceAddressFeaturesEXT *features = (void *)ext; + features->bufferDeviceAddress = pdevice->has_a64_buffer_access; + features->bufferDeviceAddressCaptureReplay = false; + features->bufferDeviceAddressMultiDevice = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BORDER_COLOR_SWIZZLE_FEATURES_EXT: { + VkPhysicalDeviceBorderColorSwizzleFeaturesEXT *features = + (VkPhysicalDeviceBorderColorSwizzleFeaturesEXT *)ext; + features->borderColorSwizzle = true; + features->borderColorSwizzleFromImage = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: { + VkPhysicalDeviceColorWriteEnableFeaturesEXT *features = + (VkPhysicalDeviceColorWriteEnableFeaturesEXT *)ext; + features->colorWriteEnable = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_2D_VIEW_OF_3D_FEATURES_EXT: { 
+ VkPhysicalDeviceImage2DViewOf3DFeaturesEXT *features = + (VkPhysicalDeviceImage2DViewOf3DFeaturesEXT *)ext; + features->image2DViewOf3D = true; + features->sampler2DViewOf3D = pdevice->info.ver >= 9; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_NV: { + VkPhysicalDeviceComputeShaderDerivativesFeaturesNV *features = + (VkPhysicalDeviceComputeShaderDerivativesFeaturesNV *)ext; + features->computeDerivativeGroupQuads = true; + features->computeDerivativeGroupLinear = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: { + VkPhysicalDeviceConditionalRenderingFeaturesEXT *features = + (VkPhysicalDeviceConditionalRenderingFeaturesEXT*)ext; + features->conditionalRendering = pdevice->info.verx10 >= 75; + features->inheritedConditionalRendering = pdevice->info.verx10 >= 75; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: { + VkPhysicalDeviceCustomBorderColorFeaturesEXT *features = + (VkPhysicalDeviceCustomBorderColorFeaturesEXT *)ext; + features->customBorderColors = pdevice->info.ver >= 8; + features->customBorderColorWithoutFormat = pdevice->info.ver >= 8; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_ENABLE_FEATURES_EXT: { + VkPhysicalDeviceDepthClipEnableFeaturesEXT *features = + (VkPhysicalDeviceDepthClipEnableFeaturesEXT *)ext; + features->depthClipEnable = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_INTERLOCK_FEATURES_EXT: { + VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT *features = + (VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT *)ext; + features->fragmentShaderSampleInterlock = pdevice->info.ver >= 9; + features->fragmentShaderPixelInterlock = pdevice->info.ver >= 9; + features->fragmentShaderShadingRateInterlock = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GLOBAL_PRIORITY_QUERY_FEATURES_KHR: { + VkPhysicalDeviceGlobalPriorityQueryFeaturesKHR *features = + (VkPhysicalDeviceGlobalPriorityQueryFeaturesKHR *)ext; + features->globalPriorityQuery = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_FEATURES_KHR: { + VkPhysicalDeviceFragmentShadingRateFeaturesKHR *features = + (VkPhysicalDeviceFragmentShadingRateFeaturesKHR *)ext; + features->attachmentFragmentShadingRate = false; + features->pipelineFragmentShadingRate = true; + features->primitiveFragmentShadingRate = + pdevice->info.has_coarse_pixel_primitive_and_cb; + features->attachmentFragmentShadingRate = + pdevice->info.has_coarse_pixel_primitive_and_cb; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_VIEW_MIN_LOD_FEATURES_EXT: { + VkPhysicalDeviceImageViewMinLodFeaturesEXT *features = + (VkPhysicalDeviceImageViewMinLodFeaturesEXT *)ext; + features->minLod = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT: { + VkPhysicalDeviceIndexTypeUint8FeaturesEXT *features = + (VkPhysicalDeviceIndexTypeUint8FeaturesEXT *)ext; + features->indexTypeUint8 = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_FEATURES_EXT: { + VkPhysicalDeviceLineRasterizationFeaturesEXT *features = + (VkPhysicalDeviceLineRasterizationFeaturesEXT *)ext; + /* Rectangular lines must use the strict algorithm, which is not + * supported for wide lines prior to ICL. See rasterization_mode for + * details and how the HW states are programmed. 
+ */ + features->rectangularLines = pdevice->info.ver >= 10; + features->bresenhamLines = true; + /* Support for Smooth lines with MSAA was removed on gfx11. From the + * BSpec section "Multisample ModesState" table for "AA Line Support + * Requirements": + * + * GFX10:BUG:######## NUM_MULTISAMPLES == 1 + * + * Fortunately, this isn't a case most people care about. + */ + features->smoothLines = pdevice->info.ver < 10; + features->stippledRectangularLines = false; + features->stippledBresenhamLines = true; + features->stippledSmoothLines = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_FEATURES_NV: { + VkPhysicalDeviceMeshShaderFeaturesNV *features = + (VkPhysicalDeviceMeshShaderFeaturesNV *)ext; + features->taskShader = pdevice->vk.supported_extensions.NV_mesh_shader; + features->meshShader = pdevice->vk.supported_extensions.NV_mesh_shader; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MUTABLE_DESCRIPTOR_TYPE_FEATURES_VALVE: { + VkPhysicalDeviceMutableDescriptorTypeFeaturesVALVE *features = + (VkPhysicalDeviceMutableDescriptorTypeFeaturesVALVE *)ext; + features->mutableDescriptorType = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: { + VkPhysicalDevicePerformanceQueryFeaturesKHR *feature = + (VkPhysicalDevicePerformanceQueryFeaturesKHR *)ext; + feature->performanceCounterQueryPools = true; + /* HW only supports a single configuration at a time. */ + feature->performanceCounterMultipleQueryPools = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR: { + VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *features = + (VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *)ext; + features->pipelineExecutableInfo = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVES_GENERATED_QUERY_FEATURES_EXT: { + VkPhysicalDevicePrimitivesGeneratedQueryFeaturesEXT *features = + (VkPhysicalDevicePrimitivesGeneratedQueryFeaturesEXT *)ext; + features->primitivesGeneratedQuery = true; + features->primitivesGeneratedQueryWithRasterizerDiscard = false; + features->primitivesGeneratedQueryWithNonZeroStreams = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_FEATURES_EXT: { + VkPhysicalDeviceProvokingVertexFeaturesEXT *features = + (VkPhysicalDeviceProvokingVertexFeaturesEXT *)ext; + features->provokingVertexLast = true; + features->transformFeedbackPreservesProvokingVertex = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR: { + VkPhysicalDeviceRayQueryFeaturesKHR *features = (void *)ext; + features->rayQuery = pdevice->info.has_ray_tracing; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: { + VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext; + features->robustBufferAccess2 = true; + features->robustImageAccess2 = true; + features->nullDescriptor = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_FEATURES_EXT: { + VkPhysicalDeviceShaderAtomicFloatFeaturesEXT *features = (void *)ext; + features->shaderBufferFloat32Atomics = true; + features->shaderBufferFloat32AtomicAdd = pdevice->info.has_lsc; + features->shaderBufferFloat64Atomics = + pdevice->info.has_64bit_float && pdevice->info.has_lsc; + features->shaderBufferFloat64AtomicAdd = false; + features->shaderSharedFloat32Atomics = true; + features->shaderSharedFloat32AtomicAdd = false; + features->shaderSharedFloat64Atomics = false; + 
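/* Illustrative sketch, not part of this patch: how an application queries one
 * of the extension feature structs handled by the switch above, by chaining
 * it into the pNext list of VkPhysicalDeviceFeatures2.  `physical_device` is
 * assumed to be a valid VkPhysicalDevice from the application's own
 * enumeration code.
 */
#include <stdio.h>
#include <vulkan/vulkan.h>

static void
query_conditional_rendering_features(VkPhysicalDevice physical_device)
{
   VkPhysicalDeviceConditionalRenderingFeaturesEXT cond_rendering = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT,
   };
   VkPhysicalDeviceFeatures2 features2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &cond_rendering,
   };

   /* The driver-side loop above recognizes the chained struct by sType and
    * fills in its members.
    */
   vkGetPhysicalDeviceFeatures2(physical_device, &features2);

   printf("conditionalRendering: %u, inheritedConditionalRendering: %u\n",
          cond_rendering.conditionalRendering,
          cond_rendering.inheritedConditionalRendering);
}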
features->shaderSharedFloat64AtomicAdd = false; + features->shaderImageFloat32Atomics = true; + features->shaderImageFloat32AtomicAdd = false; + features->sparseImageFloat32Atomics = false; + features->sparseImageFloat32AtomicAdd = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_2_FEATURES_EXT: { + VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT *features = (void *)ext; + features->shaderBufferFloat16Atomics = false; + features->shaderBufferFloat16AtomicAdd = false; + features->shaderBufferFloat16AtomicMinMax = false; + features->shaderBufferFloat32AtomicMinMax = pdevice->info.ver >= 9; + features->shaderBufferFloat64AtomicMinMax = + pdevice->info.has_64bit_float && pdevice->info.has_lsc; + features->shaderSharedFloat16Atomics = false; + features->shaderSharedFloat16AtomicAdd = false; + features->shaderSharedFloat16AtomicMinMax = false; + features->shaderSharedFloat32AtomicMinMax = pdevice->info.ver >= 9; + features->shaderSharedFloat64AtomicMinMax = false; + features->shaderImageFloat32AtomicMinMax = false; + features->sparseImageFloat32AtomicMinMax = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CLOCK_FEATURES_KHR: { + VkPhysicalDeviceShaderClockFeaturesKHR *features = + (VkPhysicalDeviceShaderClockFeaturesKHR *)ext; + features->shaderSubgroupClock = true; + features->shaderDeviceClock = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_FUNCTIONS_2_FEATURES_INTEL: { + VkPhysicalDeviceShaderIntegerFunctions2FeaturesINTEL *features = + (VkPhysicalDeviceShaderIntegerFunctions2FeaturesINTEL *)ext; + features->shaderIntegerFunctions2 = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MODULE_IDENTIFIER_FEATURES_EXT: { + VkPhysicalDeviceShaderModuleIdentifierFeaturesEXT *features = + (VkPhysicalDeviceShaderModuleIdentifierFeaturesEXT *)ext; + features->shaderModuleIdentifier = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_UNIFORM_CONTROL_FLOW_FEATURES_KHR: { + VkPhysicalDeviceShaderSubgroupUniformControlFlowFeaturesKHR *features = + (VkPhysicalDeviceShaderSubgroupUniformControlFlowFeaturesKHR *)ext; + features->shaderSubgroupUniformControlFlow = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT: { + VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *features = + (VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *)ext; + features->texelBufferAlignment = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT: { + VkPhysicalDeviceTransformFeedbackFeaturesEXT *features = + (VkPhysicalDeviceTransformFeedbackFeaturesEXT *)ext; + features->transformFeedback = true; + features->geometryStreams = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: { + VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features = + (VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *)ext; + features->vertexAttributeInstanceRateDivisor = true; + features->vertexAttributeInstanceRateZeroDivisor = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_FEATURES_KHR: { + VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR *features = + (VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR *)ext; + features->workgroupMemoryExplicitLayout = true; + features->workgroupMemoryExplicitLayoutScalarBlockLayout = true; + features->workgroupMemoryExplicitLayout8BitAccess = true; + 
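/* Illustrative sketch, not part of this patch: after querying, the same
 * chained feature structs are passed back through VkDeviceCreateInfo::pNext
 * to enable the features at device creation.  `physical_device` and
 * `queue_family_index` are assumed to come from the application's earlier
 * setup; when a VkPhysicalDeviceFeatures2 is chained, pEnabledFeatures must
 * be NULL.
 */
#include <vulkan/vulkan.h>

static VkResult
create_device_with_queried_features(VkPhysicalDevice physical_device,
                                    uint32_t queue_family_index,
                                    VkDevice *device_out)
{
   /* Query everything the implementation supports... */
   VkPhysicalDeviceFeatures2 features2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
   };
   vkGetPhysicalDeviceFeatures2(physical_device, &features2);

   /* ...and enable it wholesale (a real application would clear the
    * features it does not need).
    */
   const float priority = 1.0f;
   const VkDeviceQueueCreateInfo queue_info = {
      .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
      .queueFamilyIndex = queue_family_index,
      .queueCount = 1,
      .pQueuePriorities = &priority,
   };
   const VkDeviceCreateInfo device_info = {
      .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
      .pNext = &features2,
      .queueCreateInfoCount = 1,
      .pQueueCreateInfos = &queue_info,
      .pEnabledFeatures = NULL,
   };

   return vkCreateDevice(physical_device, &device_info, NULL, device_out);
}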
features->workgroupMemoryExplicitLayout16BitAccess = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_YCBCR_IMAGE_ARRAYS_FEATURES_EXT: { + VkPhysicalDeviceYcbcrImageArraysFeaturesEXT *features = + (VkPhysicalDeviceYcbcrImageArraysFeaturesEXT *)ext; + features->ycbcrImageArrays = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT: { + VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *features = + (VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *)ext; + features->extendedDynamicState = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_2_FEATURES_EXT: { + VkPhysicalDeviceExtendedDynamicState2FeaturesEXT *features = + (VkPhysicalDeviceExtendedDynamicState2FeaturesEXT *)ext; + features->extendedDynamicState2 = true; + features->extendedDynamicState2LogicOp = true; + features->extendedDynamicState2PatchControlPoints = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_FEATURES_EXT: { + VkPhysicalDeviceMultiDrawFeaturesEXT *features = (VkPhysicalDeviceMultiDrawFeaturesEXT *)ext; + features->multiDraw = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_NON_SEAMLESS_CUBE_MAP_FEATURES_EXT : { + VkPhysicalDeviceNonSeamlessCubeMapFeaturesEXT *features = + (VkPhysicalDeviceNonSeamlessCubeMapFeaturesEXT *)ext; + features->nonSeamlessCubeMap = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVE_TOPOLOGY_LIST_RESTART_FEATURES_EXT: { + VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT *features = + (VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT *)ext; + features->primitiveTopologyListRestart = true; + features->primitiveTopologyPatchListRestart = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_CONTROL_FEATURES_EXT: { + VkPhysicalDeviceDepthClipControlFeaturesEXT *features = + (VkPhysicalDeviceDepthClipControlFeaturesEXT *)ext; + features->depthClipControl = true; + break; + } + + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } + +} + +#define MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS 64 + +#define MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS 64 +#define MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS 256 + +#define MAX_CUSTOM_BORDER_COLORS 4096 + +void anv_GetPhysicalDeviceProperties( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceProperties* pProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + const struct intel_device_info *devinfo = &pdevice->info; + + const uint32_t max_ssbos = pdevice->has_a64_buffer_access ? UINT16_MAX : 64; + const uint32_t max_textures = + pdevice->has_bindless_images ? UINT16_MAX : 128; + const uint32_t max_samplers = + pdevice->has_bindless_samplers ? UINT16_MAX : + (devinfo->verx10 >= 75) ? 128 : 16; + const uint32_t max_images = + pdevice->has_bindless_images ? UINT16_MAX : MAX_IMAGES; + + /* If we can use bindless for everything, claim a high per-stage limit, + * otherwise use the binding table size, minus the slots reserved for + * render targets and one slot for the descriptor buffer. */ + const uint32_t max_per_stage = + pdevice->has_bindless_images && pdevice->has_a64_buffer_access + ? 
UINT32_MAX : MAX_BINDING_TABLE_SIZE - MAX_RTS - 1; + + const uint32_t max_workgroup_size = + MIN2(1024, 32 * devinfo->max_cs_workgroup_threads); + + VkSampleCountFlags sample_counts = + isl_device_get_sample_counts(&pdevice->isl_dev); + + + VkPhysicalDeviceLimits limits = { + .maxImageDimension1D = (1 << 14), + .maxImageDimension2D = (1 << 14), + .maxImageDimension3D = (1 << 11), + .maxImageDimensionCube = (1 << 14), + .maxImageArrayLayers = (1 << 11), + .maxTexelBufferElements = 128 * 1024 * 1024, + .maxUniformBufferRange = pdevice->compiler->indirect_ubos_use_sampler ? (1u << 27) : (1u << 30), + .maxStorageBufferRange = pdevice->isl_dev.max_buffer_size, + .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE, + .maxMemoryAllocationCount = UINT32_MAX, + .maxSamplerAllocationCount = 64 * 1024, + .bufferImageGranularity = 1, + .sparseAddressSpaceSize = 0, + .maxBoundDescriptorSets = MAX_SETS, + .maxPerStageDescriptorSamplers = max_samplers, + .maxPerStageDescriptorUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, + .maxPerStageDescriptorStorageBuffers = max_ssbos, + .maxPerStageDescriptorSampledImages = max_textures, + .maxPerStageDescriptorStorageImages = max_images, + .maxPerStageDescriptorInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS, + .maxPerStageResources = max_per_stage, + .maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */ + .maxDescriptorSetUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, /* number of stages * maxPerStageDescriptorUniformBuffers */ + .maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetStorageBuffers = 6 * max_ssbos, /* number of stages * maxPerStageDescriptorStorageBuffers */ + .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetSampledImages = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */ + .maxDescriptorSetStorageImages = 6 * max_images, /* number of stages * maxPerStageDescriptorStorageImages */ + .maxDescriptorSetInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS, + .maxVertexInputAttributes = MAX_VES, + .maxVertexInputBindings = MAX_VBS, + /* Broadwell PRMs: Volume 2d: Command Reference: Structures: + * + * VERTEX_ELEMENT_STATE::Source Element Offset: [0,2047] + */ + .maxVertexInputAttributeOffset = 2047, + /* Broadwell PRMs: Volume 2d: Command Reference: Structures: + * + * VERTEX_BUFFER_STATE::Buffer Pitch: [0,2048] + * + * Skylake PRMs: Volume 2d: Command Reference: Structures: + * + * VERTEX_BUFFER_STATE::Buffer Pitch: [0,4095] + */ + .maxVertexInputBindingStride = devinfo->ver < 9 ? 2048 : 4095, + .maxVertexOutputComponents = 128, + .maxTessellationGenerationLevel = 64, + .maxTessellationPatchSize = 32, + .maxTessellationControlPerVertexInputComponents = 128, + .maxTessellationControlPerVertexOutputComponents = 128, + .maxTessellationControlPerPatchOutputComponents = 128, + .maxTessellationControlTotalOutputComponents = 2048, + .maxTessellationEvaluationInputComponents = 128, + .maxTessellationEvaluationOutputComponents = 128, + .maxGeometryShaderInvocations = 32, + .maxGeometryInputComponents = devinfo->ver >= 8 ? 
128 : 64, + .maxGeometryOutputComponents = 128, + .maxGeometryOutputVertices = 256, + .maxGeometryTotalOutputComponents = 1024, + .maxFragmentInputComponents = 116, /* 128 components - (PSIZ, CLIP_DIST0, CLIP_DIST1) */ + .maxFragmentOutputAttachments = 8, + .maxFragmentDualSrcAttachments = 1, + .maxFragmentCombinedOutputResources = MAX_RTS + max_ssbos + max_images, + .maxComputeSharedMemorySize = 64 * 1024, + .maxComputeWorkGroupCount = { 65535, 65535, 65535 }, + .maxComputeWorkGroupInvocations = max_workgroup_size, + .maxComputeWorkGroupSize = { + max_workgroup_size, + max_workgroup_size, + max_workgroup_size, + }, + .subPixelPrecisionBits = 8, + .subTexelPrecisionBits = 8, + .mipmapPrecisionBits = 8, + .maxDrawIndexedIndexValue = UINT32_MAX, + .maxDrawIndirectCount = UINT32_MAX, + .maxSamplerLodBias = 16, + .maxSamplerAnisotropy = 16, + .maxViewports = MAX_VIEWPORTS, + .maxViewportDimensions = { (1 << 14), (1 << 14) }, + .viewportBoundsRange = { INT16_MIN, INT16_MAX }, + .viewportSubPixelBits = 13, /* We take a float? */ + .minMemoryMapAlignment = 4096, /* A page */ + /* The dataport requires texel alignment so we need to assume a worst + * case of R32G32B32A32 which is 16 bytes. + */ + .minTexelBufferOffsetAlignment = 16, + .minUniformBufferOffsetAlignment = ANV_UBO_ALIGNMENT, + .minStorageBufferOffsetAlignment = ANV_SSBO_ALIGNMENT, + .minTexelOffset = -8, + .maxTexelOffset = 7, + .minTexelGatherOffset = -32, + .maxTexelGatherOffset = 31, + .minInterpolationOffset = -0.5, + .maxInterpolationOffset = 0.4375, + .subPixelInterpolationOffsetBits = 4, + .maxFramebufferWidth = (1 << 14), + .maxFramebufferHeight = (1 << 14), + .maxFramebufferLayers = (1 << 11), + .framebufferColorSampleCounts = sample_counts, + .framebufferDepthSampleCounts = sample_counts, + .framebufferStencilSampleCounts = sample_counts, + .framebufferNoAttachmentsSampleCounts = sample_counts, + .maxColorAttachments = MAX_RTS, + .sampledImageColorSampleCounts = sample_counts, + .sampledImageIntegerSampleCounts = sample_counts, + .sampledImageDepthSampleCounts = sample_counts, + .sampledImageStencilSampleCounts = sample_counts, + .storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT, + .maxSampleMaskWords = 1, + .timestampComputeAndGraphics = true, + .timestampPeriod = 1000000000.0 / devinfo->timestamp_frequency, + .maxClipDistances = 8, + .maxCullDistances = 8, + .maxCombinedClipAndCullDistances = 8, + .discreteQueuePriorities = 2, + .pointSizeRange = { 0.125, 255.875 }, + /* While SKL and up support much wider lines than we are setting here, + * in practice we run into conformance issues if we go past this limit. + * Since the Windows driver does the same, it's probably fair to assume + * that no one needs more than this. + */ + .lineWidthRange = { 0.0, devinfo->ver >= 9 ? 8.0 : 7.9921875 }, + .pointSizeGranularity = (1.0 / 8.0), + .lineWidthGranularity = (1.0 / 128.0), + .strictLines = false, + .standardSampleLocations = true, + .optimalBufferCopyOffsetAlignment = 128, + .optimalBufferCopyRowPitchAlignment = 128, + .nonCoherentAtomSize = 64, + }; + + *pProperties = (VkPhysicalDeviceProperties) { + .apiVersion = ANV_API_VERSION, + .driverVersion = vk_get_driver_version(), + .vendorID = 0x8086, + .deviceID = pdevice->info.pci_device_id, + .deviceType = pdevice->info.has_local_mem ? + VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU : + VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU, + .limits = limits, + .sparseProperties = {0}, /* Broadwell doesn't do sparse. 
*/ + }; + + snprintf(pProperties->deviceName, sizeof(pProperties->deviceName), + "%s", pdevice->info.name); + memcpy(pProperties->pipelineCacheUUID, + pdevice->pipeline_cache_uuid, VK_UUID_SIZE); +} + +static void +anv_get_physical_device_properties_1_1(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan11Properties *p) +{ + assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES); + + memcpy(p->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); + memcpy(p->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE); + memset(p->deviceLUID, 0, VK_LUID_SIZE); + p->deviceNodeMask = 0; + p->deviceLUIDValid = false; + + p->subgroupSize = BRW_SUBGROUP_SIZE; + VkShaderStageFlags scalar_stages = 0; + for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) { + if (pdevice->compiler->scalar_stage[stage]) + scalar_stages |= mesa_to_vk_shader_stage(stage); + } + if (pdevice->vk.supported_extensions.KHR_ray_tracing_pipeline) { + scalar_stages |= VK_SHADER_STAGE_RAYGEN_BIT_KHR | + VK_SHADER_STAGE_ANY_HIT_BIT_KHR | + VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | + VK_SHADER_STAGE_MISS_BIT_KHR | + VK_SHADER_STAGE_INTERSECTION_BIT_KHR | + VK_SHADER_STAGE_CALLABLE_BIT_KHR; + } + if (pdevice->vk.supported_extensions.NV_mesh_shader) { + scalar_stages |= VK_SHADER_STAGE_TASK_BIT_NV | + VK_SHADER_STAGE_MESH_BIT_NV; + } + p->subgroupSupportedStages = scalar_stages; + p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT | + VK_SUBGROUP_FEATURE_QUAD_BIT; + if (pdevice->info.ver >= 8) { + /* TODO: There's no technical reason why these can't be made to + * work on gfx7 but they don't at the moment so it's best to leave + * the feature disabled than enabled and broken. + */ + p->subgroupSupportedOperations |= VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | + VK_SUBGROUP_FEATURE_CLUSTERED_BIT; + } + p->subgroupQuadOperationsInAllStages = pdevice->info.ver >= 8; + + p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY; + p->maxMultiviewViewCount = 16; + p->maxMultiviewInstanceIndex = UINT32_MAX / 16; + p->protectedNoFault = false; + /* This value doesn't matter for us today as our per-stage descriptors are + * the real limit. + */ + p->maxPerSetDescriptors = 1024; + p->maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE; +} + +static void +anv_get_physical_device_properties_1_2(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan12Properties *p) +{ + assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES); + + p->driverID = VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA; + memset(p->driverName, 0, sizeof(p->driverName)); + snprintf(p->driverName, VK_MAX_DRIVER_NAME_SIZE, + "Intel open-source Mesa driver"); + memset(p->driverInfo, 0, sizeof(p->driverInfo)); + snprintf(p->driverInfo, VK_MAX_DRIVER_INFO_SIZE, + "Mesa " PACKAGE_VERSION MESA_GIT_SHA1); + + /* Don't advertise conformance with a particular version if the hardware's + * support is incomplete/alpha. 
+ */ + if (pdevice->is_alpha) { + p->conformanceVersion = (VkConformanceVersion) { + .major = 0, + .minor = 0, + .subminor = 0, + .patch = 0, + }; + } + else { + p->conformanceVersion = (VkConformanceVersion) { + .major = 1, + .minor = 3, + .subminor = 0, + .patch = 0, + }; + } + + p->denormBehaviorIndependence = + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL; + p->roundingModeIndependence = + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE; + + /* Broadwell does not support HF denorms and there are restrictions in + * other gens. According to Kabylake's PRM: + * + * "math - Extended Math Function + * [...] + * Restriction : Half-float denorms are always retained." + */ + p->shaderDenormFlushToZeroFloat16 = false; + p->shaderDenormPreserveFloat16 = pdevice->info.ver > 8; + p->shaderRoundingModeRTEFloat16 = true; + p->shaderRoundingModeRTZFloat16 = true; + p->shaderSignedZeroInfNanPreserveFloat16 = true; + + p->shaderDenormFlushToZeroFloat32 = true; + p->shaderDenormPreserveFloat32 = true; + p->shaderRoundingModeRTEFloat32 = true; + p->shaderRoundingModeRTZFloat32 = true; + p->shaderSignedZeroInfNanPreserveFloat32 = true; + + p->shaderDenormFlushToZeroFloat64 = true; + p->shaderDenormPreserveFloat64 = true; + p->shaderRoundingModeRTEFloat64 = true; + p->shaderRoundingModeRTZFloat64 = true; + p->shaderSignedZeroInfNanPreserveFloat64 = true; + + /* It's a bit hard to exactly map our implementation to the limits + * described by Vulkan. The bindless surface handle in the extended + * message descriptors is 20 bits and it's an index into the table of + * RENDER_SURFACE_STATE structs that starts at bindless surface base + * address. This means that we can have at most 1M surface states + * allocated at any given time. Since most image views take two + * descriptors, this means we have a limit of about 500K image views. + * + * However, since we allocate surface states at vkCreateImageView time, + * this means our limit is actually something on the order of 500K image + * views allocated at any time. The actual limit described by Vulkan, on + * the other hand, is a limit of how many you can have in a descriptor set. + * Assuming anyone using 1M descriptors will be using the same image view + * twice a bunch of times (or a bunch of null descriptors), we can safely + * advertise a larger limit here.
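 * (Illustration, not part of the patch: a 20-bit handle indexes 2^20 = 1,048,576 RENDER_SURFACE_STATE entries, which is where the 1 << 20 max_bindless_views just below comes from; at two descriptors per image view that is ~524K views, the ~500K figure above.)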
+ */ + const unsigned max_bindless_views = 1 << 20; + p->maxUpdateAfterBindDescriptorsInAllPools = max_bindless_views; + p->shaderUniformBufferArrayNonUniformIndexingNative = false; + p->shaderSampledImageArrayNonUniformIndexingNative = false; + p->shaderStorageBufferArrayNonUniformIndexingNative = true; + p->shaderStorageImageArrayNonUniformIndexingNative = false; + p->shaderInputAttachmentArrayNonUniformIndexingNative = false; + p->robustBufferAccessUpdateAfterBind = true; + p->quadDivergentImplicitLod = false; + p->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views; + p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS; + p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX; + p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views; + p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views; + p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS; + p->maxPerStageUpdateAfterBindResources = UINT32_MAX; + p->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views; + p->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS; + p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2; + p->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX; + p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2; + p->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views; + p->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views; + p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS; + + /* We support all of the depth resolve modes */ + p->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT | + VK_RESOLVE_MODE_AVERAGE_BIT | + VK_RESOLVE_MODE_MIN_BIT | + VK_RESOLVE_MODE_MAX_BIT; + /* Average doesn't make sense for stencil so we don't support that */ + p->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT; + if (pdevice->info.ver >= 8) { + /* The advanced stencil resolve modes currently require stencil + * sampling be supported by the hardware. 
+ */ + p->supportedStencilResolveModes |= VK_RESOLVE_MODE_MIN_BIT | + VK_RESOLVE_MODE_MAX_BIT; + } + p->independentResolveNone = true; + p->independentResolve = true; + + p->filterMinmaxSingleComponentFormats = pdevice->info.ver >= 9; + p->filterMinmaxImageComponentMapping = pdevice->info.ver >= 9; + + p->maxTimelineSemaphoreValueDifference = UINT64_MAX; + + p->framebufferIntegerColorSampleCounts = + isl_device_get_sample_counts(&pdevice->isl_dev); +} + +static void +anv_get_physical_device_properties_1_3(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan13Properties *p) +{ + assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_PROPERTIES); + + p->minSubgroupSize = 8; + p->maxSubgroupSize = 32; + p->maxComputeWorkgroupSubgroups = pdevice->info.max_cs_workgroup_threads; + p->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT | + VK_SHADER_STAGE_TASK_BIT_NV | + VK_SHADER_STAGE_MESH_BIT_NV; + + p->maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE; + p->maxPerStageDescriptorInlineUniformBlocks = + MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS; + p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = + MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS; + p->maxDescriptorSetInlineUniformBlocks = + MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS; + p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = + MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS; + p->maxInlineUniformTotalSize = UINT16_MAX; + + p->integerDotProduct8BitUnsignedAccelerated = false; + p->integerDotProduct8BitSignedAccelerated = false; + p->integerDotProduct8BitMixedSignednessAccelerated = false; + p->integerDotProduct4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12; + p->integerDotProduct4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12; + p->integerDotProduct4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12; + p->integerDotProduct16BitUnsignedAccelerated = false; + p->integerDotProduct16BitSignedAccelerated = false; + p->integerDotProduct16BitMixedSignednessAccelerated = false; + p->integerDotProduct32BitUnsignedAccelerated = false; + p->integerDotProduct32BitSignedAccelerated = false; + p->integerDotProduct32BitMixedSignednessAccelerated = false; + p->integerDotProduct64BitUnsignedAccelerated = false; + p->integerDotProduct64BitSignedAccelerated = false; + p->integerDotProduct64BitMixedSignednessAccelerated = false; + p->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false; + p->integerDotProductAccumulatingSaturating8BitSignedAccelerated = false; + p->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false; + p->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12; + p->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12; + p->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12; + p->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false; + p->integerDotProductAccumulatingSaturating16BitSignedAccelerated = false; + p->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false; + p->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false; + p->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false; + p->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false; + p->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false; + p->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false; + 
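/* Illustrative sketch, not part of this patch: how an application reads the
 * core 1.2 properties filled in by anv_get_physical_device_properties_1_2
 * above, by chaining VkPhysicalDeviceVulkan12Properties into the pNext list
 * consumed by anv_GetPhysicalDeviceProperties2 further down.
 * `physical_device` is assumed to be a valid VkPhysicalDevice.
 */
#include <stdio.h>
#include <vulkan/vulkan.h>

static void
print_driver_and_conformance(VkPhysicalDevice physical_device)
{
   VkPhysicalDeviceVulkan12Properties vk12 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES,
   };
   VkPhysicalDeviceProperties2 props2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
      .pNext = &vk12,
   };

   vkGetPhysicalDeviceProperties2(physical_device, &props2);

   printf("driver: %s (%s), conformance %d.%d.%d.%d\n",
          vk12.driverName, vk12.driverInfo,
          vk12.conformanceVersion.major, vk12.conformanceVersion.minor,
          vk12.conformanceVersion.subminor, vk12.conformanceVersion.patch);

   /* limits.timestampPeriod is nanoseconds per timestamp tick, i.e.
    * 1e9 / timestamp_frequency as set up in anv_GetPhysicalDeviceProperties.
    */
   printf("timestampPeriod: %f ns/tick\n",
          props2.properties.limits.timestampPeriod);
}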
p->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false; + + /* From the SKL PRM Vol. 2d, docs for RENDER_SURFACE_STATE::Surface + * Base Address: + * + * "For SURFTYPE_BUFFER non-rendertarget surfaces, this field + * specifies the base address of the first element of the surface, + * computed in software by adding the surface base address to the + * byte offset of the element in the buffer. The base address must + * be aligned to element size." + * + * The typed dataport messages require that things be texel aligned. + * Otherwise, we may just load/store the wrong data or, in the worst + * case, there may be hangs. + */ + p->storageTexelBufferOffsetAlignmentBytes = 16; + p->storageTexelBufferOffsetSingleTexelAlignment = true; + + /* The sampler, however, is much more forgiving and it can handle + * arbitrary byte alignment for linear and buffer surfaces. It's + * hard to find a good PRM citation for this but years of empirical + * experience demonstrate that this is true. + */ + p->uniformTexelBufferOffsetAlignmentBytes = 1; + p->uniformTexelBufferOffsetSingleTexelAlignment = false; + + p->maxBufferSize = pdevice->isl_dev.max_buffer_size; +} + +void anv_GetPhysicalDeviceProperties2( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceProperties2* pProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + + anv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties); + + VkPhysicalDeviceVulkan11Properties core_1_1 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES, + }; + anv_get_physical_device_properties_1_1(pdevice, &core_1_1); + + VkPhysicalDeviceVulkan12Properties core_1_2 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES, + }; + anv_get_physical_device_properties_1_2(pdevice, &core_1_2); + + VkPhysicalDeviceVulkan13Properties core_1_3 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_PROPERTIES, + }; + anv_get_physical_device_properties_1_3(pdevice, &core_1_3); + + vk_foreach_struct(ext, pProperties->pNext) { + if (vk_get_physical_device_core_1_1_property_ext(ext, &core_1_1)) + continue; + if (vk_get_physical_device_core_1_2_property_ext(ext, &core_1_2)) + continue; + if (vk_get_physical_device_core_1_3_property_ext(ext, &core_1_3)) + continue; + + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_PROPERTIES_KHR: { + VkPhysicalDeviceAccelerationStructurePropertiesKHR *props = (void *)ext; + props->maxGeometryCount = (1u << 24) - 1; + props->maxInstanceCount = (1u << 24) - 1; + props->maxPrimitiveCount = (1u << 29) - 1; + props->maxPerStageDescriptorAccelerationStructures = UINT16_MAX; + props->maxPerStageDescriptorUpdateAfterBindAccelerationStructures = UINT16_MAX; + props->maxDescriptorSetAccelerationStructures = UINT16_MAX; + props->maxDescriptorSetUpdateAfterBindAccelerationStructures = UINT16_MAX; + props->minAccelerationStructureScratchOffsetAlignment = 64; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONSERVATIVE_RASTERIZATION_PROPERTIES_EXT: { + /* TODO: Real limits */ + VkPhysicalDeviceConservativeRasterizationPropertiesEXT *properties = + (VkPhysicalDeviceConservativeRasterizationPropertiesEXT *)ext; + /* There's nothing in the public docs about this value as far as I + * can tell. 
However, this is the value the Windows driver reports + * and there's a comment on a rejected HW feature in the internal + * docs that says: + * + * "This is similar to conservative rasterization, except the + * primitive area is not extended by 1/512 and..." + * + * That's a bit of an obtuse reference but it's the best we've got + * for now. + */ + properties->primitiveOverestimationSize = 1.0f / 512.0f; + properties->maxExtraPrimitiveOverestimationSize = 0.0f; + properties->extraPrimitiveOverestimationSizeGranularity = 0.0f; + properties->primitiveUnderestimation = false; + properties->conservativePointAndLineRasterization = false; + properties->degenerateTrianglesRasterized = true; + properties->degenerateLinesRasterized = false; + properties->fullyCoveredFragmentShaderInputVariable = false; + properties->conservativeRasterizationPostDepthCoverage = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_PROPERTIES_EXT: { + VkPhysicalDeviceCustomBorderColorPropertiesEXT *properties = + (VkPhysicalDeviceCustomBorderColorPropertiesEXT *)ext; + properties->maxCustomBorderColorSamplers = MAX_CUSTOM_BORDER_COLORS; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_PROPERTIES_KHR: { + VkPhysicalDeviceFragmentShadingRatePropertiesKHR *props = + (VkPhysicalDeviceFragmentShadingRatePropertiesKHR *)ext; + props->primitiveFragmentShadingRateWithMultipleViewports = + pdevice->info.has_coarse_pixel_primitive_and_cb; + props->layeredShadingRateAttachments = pdevice->info.has_coarse_pixel_primitive_and_cb; + props->fragmentShadingRateNonTrivialCombinerOps = + pdevice->info.has_coarse_pixel_primitive_and_cb; + props->maxFragmentSize = (VkExtent2D) { 4, 4 }; + props->maxFragmentSizeAspectRatio = + pdevice->info.has_coarse_pixel_primitive_and_cb ? + 2 : 4; + props->maxFragmentShadingRateCoverageSamples = 4 * 4 * + (pdevice->info.has_coarse_pixel_primitive_and_cb ? 4 : 16); + props->maxFragmentShadingRateRasterizationSamples = + pdevice->info.has_coarse_pixel_primitive_and_cb ? + VK_SAMPLE_COUNT_4_BIT : VK_SAMPLE_COUNT_16_BIT; + props->fragmentShadingRateWithShaderDepthStencilWrites = false; + props->fragmentShadingRateWithSampleMask = true; + props->fragmentShadingRateWithShaderSampleMask = false; + props->fragmentShadingRateWithConservativeRasterization = true; + props->fragmentShadingRateWithFragmentShaderInterlock = true; + props->fragmentShadingRateWithCustomSampleLocations = true; + + /* Fix in DG2_G10_C0 and DG2_G11_B0. Consider any other Sku as having + * the fix. + */ + props->fragmentShadingRateStrictMultiplyCombiner = + pdevice->info.platform == INTEL_PLATFORM_DG2_G10 ? + pdevice->info.revision >= 8 : + pdevice->info.platform == INTEL_PLATFORM_DG2_G11 ? + pdevice->info.revision >= 4 : true; + + if (pdevice->info.has_coarse_pixel_primitive_and_cb) { + props->minFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 8, 8 }; + props->maxFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 8, 8 }; + props->maxFragmentShadingRateAttachmentTexelSizeAspectRatio = 1; + } else { + /* Those must be 0 if attachmentFragmentShadingRate is not + * supported. 
+ */ + props->minFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 }; + props->maxFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 }; + props->maxFragmentShadingRateAttachmentTexelSizeAspectRatio = 0; + } + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: { + VkPhysicalDeviceDrmPropertiesEXT *props = + (VkPhysicalDeviceDrmPropertiesEXT *)ext; + + props->hasPrimary = pdevice->has_master; + props->primaryMajor = pdevice->master_major; + props->primaryMinor = pdevice->master_minor; + + props->hasRender = pdevice->has_local; + props->renderMajor = pdevice->local_major; + props->renderMinor = pdevice->local_minor; + + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT: { + VkPhysicalDeviceExternalMemoryHostPropertiesEXT *props = + (VkPhysicalDeviceExternalMemoryHostPropertiesEXT *) ext; + /* Userptr needs page aligned memory. */ + props->minImportedHostPointerAlignment = 4096; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_PROPERTIES_EXT: { + VkPhysicalDeviceLineRasterizationPropertiesEXT *props = + (VkPhysicalDeviceLineRasterizationPropertiesEXT *)ext; + /* In the Skylake PRM Vol. 7, subsection titled "GIQ (Diamond) + * Sampling Rules - Legacy Mode", it says the following: + * + * "Note that the device divides a pixel into a 16x16 array of + * subpixels, referenced by their upper left corners." + * + * This is the only known reference in the PRMs to the subpixel + * precision of line rasterization and a "16x16 array of subpixels" + * implies 4 subpixel precision bits. Empirical testing has shown + * that 4 subpixel precision bits applies to all line rasterization + * types. + */ + props->lineSubPixelPrecisionBits = 4; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_PROPERTIES: { + VkPhysicalDeviceMaintenance4Properties *properties = + (VkPhysicalDeviceMaintenance4Properties *)ext; + properties->maxBufferSize = pdevice->isl_dev.max_buffer_size; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_PROPERTIES_NV: { + VkPhysicalDeviceMeshShaderPropertiesNV *props = + (VkPhysicalDeviceMeshShaderPropertiesNV *)ext; + + /* Bounded by the maximum representable size in + * 3DSTATE_MESH_SHADER_BODY::SharedLocalMemorySize. Same for Task. + */ + const uint32_t max_slm_size = 64 * 1024; + + /* Bounded by the maximum representable size in + * 3DSTATE_MESH_SHADER_BODY::LocalXMaximum. Same for Task. + */ + const uint32_t max_workgroup_size = 1 << 10; + + /* Bounded by the maximum representable count in + * 3DSTATE_MESH_SHADER_BODY::MaximumPrimitiveCount. + */ + const uint32_t max_primitives = 1024; + + /* TODO(mesh): Multiview. */ + const uint32_t max_view_count = 1; + + props->maxDrawMeshTasksCount = UINT32_MAX; + + /* TODO(mesh): Implement workgroup Y and Z sizes larger than one by + * mapping them to/from the single value that HW provides us + * (currently used for X). 
+ */ + + props->maxTaskWorkGroupInvocations = max_workgroup_size; + props->maxTaskWorkGroupSize[0] = max_workgroup_size; + props->maxTaskWorkGroupSize[1] = 1; + props->maxTaskWorkGroupSize[2] = 1; + props->maxTaskTotalMemorySize = max_slm_size; + props->maxTaskOutputCount = UINT16_MAX; + + props->maxMeshWorkGroupInvocations = max_workgroup_size; + props->maxMeshWorkGroupSize[0] = max_workgroup_size; + props->maxMeshWorkGroupSize[1] = 1; + props->maxMeshWorkGroupSize[2] = 1; + props->maxMeshTotalMemorySize = max_slm_size / max_view_count; + props->maxMeshOutputPrimitives = max_primitives / max_view_count; + props->maxMeshMultiviewViewCount = max_view_count; + + /* Depends on what indices can be represented with IndexFormat. For + * now we always use U32, so bound to the maximum unique vertices we + * need for the maximum primitives. + * + * TODO(mesh): Revisit this if we drop "U32" IndexFormat when adding + * support for others. + */ + props->maxMeshOutputVertices = 3 * props->maxMeshOutputPrimitives; + + + props->meshOutputPerVertexGranularity = 32; + props->meshOutputPerPrimitiveGranularity = 32; + + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT: { + VkPhysicalDevicePCIBusInfoPropertiesEXT *properties = + (VkPhysicalDevicePCIBusInfoPropertiesEXT *)ext; + properties->pciDomain = pdevice->info.pci_domain; + properties->pciBus = pdevice->info.pci_bus; + properties->pciDevice = pdevice->info.pci_dev; + properties->pciFunction = pdevice->info.pci_func; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR: { + VkPhysicalDevicePerformanceQueryPropertiesKHR *properties = + (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext; + /* We could support this by spawning a shader to do the equation + * normalization. 
+ */ + properties->allowCommandBufferQueryCopies = false; + break; + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENTATION_PROPERTIES_ANDROID: { + VkPhysicalDevicePresentationPropertiesANDROID *props = + (VkPhysicalDevicePresentationPropertiesANDROID *)ext; + props->sharedImage = VK_FALSE; + break; + } +#pragma GCC diagnostic pop + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: { + VkPhysicalDeviceProvokingVertexPropertiesEXT *properties = + (VkPhysicalDeviceProvokingVertexPropertiesEXT *)ext; + properties->provokingVertexModePerPipeline = true; + properties->transformFeedbackPreservesTriangleFanProvokingVertex = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR: { + VkPhysicalDevicePushDescriptorPropertiesKHR *properties = + (VkPhysicalDevicePushDescriptorPropertiesKHR *) ext; + properties->maxPushDescriptors = MAX_PUSH_DESCRIPTORS; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_PROPERTIES_EXT: { + VkPhysicalDeviceRobustness2PropertiesEXT *properties = (void *)ext; + properties->robustStorageBufferAccessSizeAlignment = + ANV_SSBO_BOUNDS_CHECK_ALIGNMENT; + properties->robustUniformBufferAccessSizeAlignment = + ANV_UBO_ALIGNMENT; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLE_LOCATIONS_PROPERTIES_EXT: { + VkPhysicalDeviceSampleLocationsPropertiesEXT *props = + (VkPhysicalDeviceSampleLocationsPropertiesEXT *)ext; + + props->sampleLocationSampleCounts = + isl_device_get_sample_counts(&pdevice->isl_dev); + + /* See also anv_GetPhysicalDeviceMultisamplePropertiesEXT */ + props->maxSampleLocationGridSize.width = 1; + props->maxSampleLocationGridSize.height = 1; + + props->sampleLocationCoordinateRange[0] = 0; + props->sampleLocationCoordinateRange[1] = 0.9375; + props->sampleLocationSubPixelBits = 4; + + props->variableSampleLocations = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MODULE_IDENTIFIER_PROPERTIES_EXT: { + VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *props = + (VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *)ext; + STATIC_ASSERT(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) == + sizeof(props->shaderModuleIdentifierAlgorithmUUID)); + memcpy(props->shaderModuleIdentifierAlgorithmUUID, + vk_shaderModuleIdentifierAlgorithmUUID, + sizeof(props->shaderModuleIdentifierAlgorithmUUID)); + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: { + VkPhysicalDeviceTransformFeedbackPropertiesEXT *props = + (VkPhysicalDeviceTransformFeedbackPropertiesEXT *)ext; + + props->maxTransformFeedbackStreams = MAX_XFB_STREAMS; + props->maxTransformFeedbackBuffers = MAX_XFB_BUFFERS; + props->maxTransformFeedbackBufferSize = (1ull << 32); + props->maxTransformFeedbackStreamDataSize = 128 * 4; + props->maxTransformFeedbackBufferDataSize = 128 * 4; + props->maxTransformFeedbackBufferDataStride = 2048; + props->transformFeedbackQueries = true; + props->transformFeedbackStreamsLinesTriangles = false; + props->transformFeedbackRasterizationStreamSelect = false; + /* This requires MI_MATH */ + props->transformFeedbackDraw = pdevice->info.verx10 >= 75; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: { + VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *props = + (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext; + /* We have to restrict this a bit for multiview */ + props->maxVertexAttribDivisor = 
UINT32_MAX / 16; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_PROPERTIES_EXT: { + VkPhysicalDeviceMultiDrawPropertiesEXT *props = (VkPhysicalDeviceMultiDrawPropertiesEXT *)ext; + props->maxMultiDrawCount = 2048; + break; + } + + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } +} + +static int +vk_priority_to_gen(int priority) +{ + switch (priority) { + case VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR: + return INTEL_CONTEXT_LOW_PRIORITY; + case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR: + return INTEL_CONTEXT_MEDIUM_PRIORITY; + case VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR: + return INTEL_CONTEXT_HIGH_PRIORITY; + case VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR: + return INTEL_CONTEXT_REALTIME_PRIORITY; + default: + unreachable("Invalid priority"); + } +} + +static const VkQueueFamilyProperties +anv_queue_family_properties_template = { + .timestampValidBits = 36, /* XXX: Real value here */ + .minImageTransferGranularity = { 1, 1, 1 }, +}; + +void anv_GetPhysicalDeviceQueueFamilyProperties2( + VkPhysicalDevice physicalDevice, + uint32_t* pQueueFamilyPropertyCount, + VkQueueFamilyProperties2* pQueueFamilyProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out, + pQueueFamilyProperties, pQueueFamilyPropertyCount); + + for (uint32_t i = 0; i < pdevice->queue.family_count; i++) { + struct anv_queue_family *queue_family = &pdevice->queue.families[i]; + vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) { + p->queueFamilyProperties = anv_queue_family_properties_template; + p->queueFamilyProperties.queueFlags = queue_family->queueFlags; + p->queueFamilyProperties.queueCount = queue_family->queueCount; + + vk_foreach_struct(ext, p->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_KHR: { + VkQueueFamilyGlobalPriorityPropertiesKHR *properties = + (VkQueueFamilyGlobalPriorityPropertiesKHR *)ext; + + /* Deliberately sorted low to high */ + VkQueueGlobalPriorityKHR all_priorities[] = { + VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR, + VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR, + VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR, + VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR, + }; + + uint32_t count = 0; + for (unsigned i = 0; i < ARRAY_SIZE(all_priorities); i++) { + if (vk_priority_to_gen(all_priorities[i]) > + pdevice->max_context_priority) + break; + + properties->priorities[count++] = all_priorities[i]; + } + properties->priorityCount = count; + break; + } + + default: + anv_debug_ignored_stype(ext->sType); + } + } + } + } +} + +void anv_GetPhysicalDeviceMemoryProperties( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties* pMemoryProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + + pMemoryProperties->memoryTypeCount = physical_device->memory.type_count; + for (uint32_t i = 0; i < physical_device->memory.type_count; i++) { + pMemoryProperties->memoryTypes[i] = (VkMemoryType) { + .propertyFlags = physical_device->memory.types[i].propertyFlags, + .heapIndex = physical_device->memory.types[i].heapIndex, + }; + } + + pMemoryProperties->memoryHeapCount = physical_device->memory.heap_count; + for (uint32_t i = 0; i < physical_device->memory.heap_count; i++) { + pMemoryProperties->memoryHeaps[i] = (VkMemoryHeap) { + .size = physical_device->memory.heaps[i].size, + .flags = physical_device->memory.heaps[i].flags, + }; + } +} + +static void +anv_get_memory_budget(VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryBudgetPropertiesEXT 
*memoryBudget) +{ + ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice); + + if (!device->vk.supported_extensions.EXT_memory_budget) + return; + + anv_update_meminfo(device, device->local_fd); + + VkDeviceSize total_sys_heaps_size = 0, total_vram_heaps_size = 0; + for (size_t i = 0; i < device->memory.heap_count; i++) { + if (device->memory.heaps[i].is_local_mem) { + total_vram_heaps_size += device->memory.heaps[i].size; + } else { + total_sys_heaps_size += device->memory.heaps[i].size; + } + } + + for (size_t i = 0; i < device->memory.heap_count; i++) { + VkDeviceSize heap_size = device->memory.heaps[i].size; + VkDeviceSize heap_used = device->memory.heaps[i].used; + VkDeviceSize heap_budget, total_heaps_size; + uint64_t mem_available = 0; + + if (device->memory.heaps[i].is_local_mem) { + total_heaps_size = total_vram_heaps_size; + if (device->vram_non_mappable.size > 0 && i == 0) { + mem_available = device->vram_non_mappable.available; + } else { + mem_available = device->vram_mappable.available; + } + } else { + total_heaps_size = total_sys_heaps_size; + mem_available = device->sys.available; + } + + double heap_proportion = (double) heap_size / total_heaps_size; + VkDeviceSize available_prop = mem_available * heap_proportion; + + /* + * Let's not incite the app to starve the system: report at most 90% of + * the available heap memory. + */ + uint64_t heap_available = available_prop * 9 / 10; + heap_budget = MIN2(heap_size, heap_used + heap_available); + + /* + * Round down to the nearest MB + */ + heap_budget &= ~((1ull << 20) - 1); + + /* + * The heapBudget value must be non-zero for array elements less than + * VkPhysicalDeviceMemoryProperties::memoryHeapCount. The heapBudget + * value must be less than or equal to VkMemoryHeap::size for each heap. 
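A minimal standalone sketch of the heap budget heuristic used above: attribute a share of the still-available system/VRAM memory to the heap, hand at most 90% of that to the application, clamp to the heap size, and round down to a whole MiB. The helper and the numbers in main() are illustrative assumptions, not values from this patch.

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_MIN2(a, b) ((a) < (b) ? (a) : (b))

/* Illustrative only: compute a heap budget from its size, its current usage
 * and the memory still available to the heaps it shares a region with.
 */
static uint64_t
example_heap_budget(uint64_t heap_size, uint64_t heap_used,
                    uint64_t mem_available, double heap_proportion)
{
   /* Report at most 90% of the available memory so the app does not starve
    * the rest of the system.
    */
   uint64_t heap_available =
      (uint64_t)(mem_available * heap_proportion) * 9 / 10;
   uint64_t heap_budget = EXAMPLE_MIN2(heap_size, heap_used + heap_available);

   /* Round down to the nearest MiB, as the driver does above. */
   heap_budget &= ~((1ull << 20) - 1);
   return heap_budget;
}

int main(void)
{
   /* e.g. an 8 GiB heap, 1 GiB already used, 5 GiB still available. */
   printf("budget = %llu MiB\n",
          (unsigned long long)(example_heap_budget(8ull << 30, 1ull << 30,
                                                   5ull << 30, 1.0) >> 20));
   return 0;
}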
+ */ + assert(0 < heap_budget && heap_budget <= heap_size); + + memoryBudget->heapUsage[i] = heap_used; + memoryBudget->heapBudget[i] = heap_budget; + } + + /* The heapBudget and heapUsage values must be zero for array elements + * greater than or equal to VkPhysicalDeviceMemoryProperties::memoryHeapCount + */ + for (uint32_t i = device->memory.heap_count; i < VK_MAX_MEMORY_HEAPS; i++) { + memoryBudget->heapBudget[i] = 0; + memoryBudget->heapUsage[i] = 0; + } +} + +void anv_GetPhysicalDeviceMemoryProperties2( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties2* pMemoryProperties) +{ + anv_GetPhysicalDeviceMemoryProperties(physicalDevice, + &pMemoryProperties->memoryProperties); + + vk_foreach_struct(ext, pMemoryProperties->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT: + anv_get_memory_budget(physicalDevice, (void*)ext); + break; + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } +} + +void +anv_GetDeviceGroupPeerMemoryFeatures( + VkDevice device, + uint32_t heapIndex, + uint32_t localDeviceIndex, + uint32_t remoteDeviceIndex, + VkPeerMemoryFeatureFlags* pPeerMemoryFeatures) +{ + assert(localDeviceIndex == 0 && remoteDeviceIndex == 0); + *pPeerMemoryFeatures = VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT | + VK_PEER_MEMORY_FEATURE_COPY_DST_BIT | + VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT | + VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT; +} + +PFN_vkVoidFunction anv_GetInstanceProcAddr( + VkInstance _instance, + const char* pName) +{ + ANV_FROM_HANDLE(anv_instance, instance, _instance); + return vk_instance_get_proc_addr(&instance->vk, + &anv_instance_entrypoints, + pName); +} + +/* With version 1+ of the loader interface the ICD should expose + * vk_icdGetInstanceProcAddr to work around certain LD_PRELOAD issues seen in apps. 
+ */ +PUBLIC +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr( + VkInstance instance, + const char* pName); + +PUBLIC +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr( + VkInstance instance, + const char* pName) +{ + return anv_GetInstanceProcAddr(instance, pName); +} + +/* With version 4+ of the loader interface the ICD should expose + * vk_icdGetPhysicalDeviceProcAddr() + */ +PUBLIC +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetPhysicalDeviceProcAddr( + VkInstance _instance, + const char* pName); + +PFN_vkVoidFunction vk_icdGetPhysicalDeviceProcAddr( + VkInstance _instance, + const char* pName) +{ + ANV_FROM_HANDLE(anv_instance, instance, _instance); + return vk_instance_get_physical_device_proc_addr(&instance->vk, pName); +} + +static struct anv_state +anv_state_pool_emit_data(struct anv_state_pool *pool, size_t size, size_t align, const void *p) +{ + struct anv_state state; + + state = anv_state_pool_alloc(pool, size, align); + memcpy(state.map, p, size); + + return state; +} + +static void +anv_device_init_border_colors(struct anv_device *device) +{ + if (device->info->platform == INTEL_PLATFORM_HSW) { + static const struct hsw_border_color border_colors[] = { + [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 0.0 } }, + [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 1.0 } }, + [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = { .float32 = { 1.0, 1.0, 1.0, 1.0 } }, + [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = { .uint32 = { 0, 0, 0, 0 } }, + [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = { .uint32 = { 0, 0, 0, 1 } }, + [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = { .uint32 = { 1, 1, 1, 1 } }, + }; + + device->border_colors = + anv_state_pool_emit_data(&device->dynamic_state_pool, + sizeof(border_colors), 512, border_colors); + } else { + static const struct gfx8_border_color border_colors[] = { + [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 0.0 } }, + [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 1.0 } }, + [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = { .float32 = { 1.0, 1.0, 1.0, 1.0 } }, + [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = { .uint32 = { 0, 0, 0, 0 } }, + [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = { .uint32 = { 0, 0, 0, 1 } }, + [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = { .uint32 = { 1, 1, 1, 1 } }, + }; + + device->border_colors = + anv_state_pool_emit_data(&device->dynamic_state_pool, + sizeof(border_colors), 64, border_colors); + } +} + +static VkResult +anv_device_init_trivial_batch(struct anv_device *device) +{ + VkResult result = anv_device_alloc_bo(device, "trivial-batch", 4096, + ANV_BO_ALLOC_MAPPED, + 0 /* explicit_address */, + &device->trivial_batch_bo); + if (result != VK_SUCCESS) + return result; + + struct anv_batch batch = { + .start = device->trivial_batch_bo->map, + .next = device->trivial_batch_bo->map, + .end = device->trivial_batch_bo->map + 4096, + }; + + anv_batch_emit(&batch, GFX7_MI_BATCH_BUFFER_END, bbe); + anv_batch_emit(&batch, GFX7_MI_NOOP, noop); + + if (device->physical->memory.need_clflush) + intel_clflush_range(batch.start, batch.next - batch.start); + + return VK_SUCCESS; +} + +static bool +get_bo_from_pool(struct intel_batch_decode_bo *ret, + struct anv_block_pool *pool, + uint64_t address) +{ + anv_block_pool_foreach_bo(bo, pool) { + uint64_t bo_address = intel_48b_address(bo->offset); + if (address >= bo_address && address < (bo_address + bo->size)) { + *ret = (struct intel_batch_decode_bo) { + .addr = bo_address, + .size = bo->size, + .map = 
bo->map, + }; + return true; + } + } + return false; +} + +/* Finding a buffer for batch decoding */ +static struct intel_batch_decode_bo +decode_get_bo(void *v_batch, bool ppgtt, uint64_t address) +{ + struct anv_device *device = v_batch; + struct intel_batch_decode_bo ret_bo = {}; + + assert(ppgtt); + + if (get_bo_from_pool(&ret_bo, &device->dynamic_state_pool.block_pool, address)) + return ret_bo; + if (get_bo_from_pool(&ret_bo, &device->instruction_state_pool.block_pool, address)) + return ret_bo; + if (get_bo_from_pool(&ret_bo, &device->binding_table_pool.block_pool, address)) + return ret_bo; + if (get_bo_from_pool(&ret_bo, &device->surface_state_pool.block_pool, address)) + return ret_bo; + + if (!device->cmd_buffer_being_decoded) + return (struct intel_batch_decode_bo) { }; + + struct anv_batch_bo **bo; + + u_vector_foreach(bo, &device->cmd_buffer_being_decoded->seen_bbos) { + /* The decoder zeroes out the top 16 bits, so we need to as well */ + uint64_t bo_address = (*bo)->bo->offset & (~0ull >> 16); + + if (address >= bo_address && address < bo_address + (*bo)->bo->size) { + return (struct intel_batch_decode_bo) { + .addr = bo_address, + .size = (*bo)->bo->size, + .map = (*bo)->bo->map, + }; + } + } + + return (struct intel_batch_decode_bo) { }; +} + +struct intel_aux_map_buffer { + struct intel_buffer base; + struct anv_state state; +}; + +static struct intel_buffer * +intel_aux_map_buffer_alloc(void *driver_ctx, uint32_t size) +{ + struct intel_aux_map_buffer *buf = malloc(sizeof(struct intel_aux_map_buffer)); + if (!buf) + return NULL; + + struct anv_device *device = (struct anv_device*)driver_ctx; + assert(device->physical->supports_48bit_addresses && + device->physical->use_softpin); + + struct anv_state_pool *pool = &device->dynamic_state_pool; + buf->state = anv_state_pool_alloc(pool, size, size); + + buf->base.gpu = pool->block_pool.bo->offset + buf->state.offset; + buf->base.gpu_end = buf->base.gpu + buf->state.alloc_size; + buf->base.map = buf->state.map; + buf->base.driver_bo = &buf->state; + return &buf->base; +} + +static void +intel_aux_map_buffer_free(void *driver_ctx, struct intel_buffer *buffer) +{ + struct intel_aux_map_buffer *buf = (struct intel_aux_map_buffer*)buffer; + struct anv_device *device = (struct anv_device*)driver_ctx; + struct anv_state_pool *pool = &device->dynamic_state_pool; + anv_state_pool_free(pool, buf->state); + free(buf); +} + +static struct intel_mapped_pinned_buffer_alloc aux_map_allocator = { + .alloc = intel_aux_map_buffer_alloc, + .free = intel_aux_map_buffer_free, +}; + +static VkResult anv_device_check_status(struct vk_device *vk_device); + +static VkResult +anv_device_setup_context(struct anv_device *device, + const VkDeviceCreateInfo *pCreateInfo, + const uint32_t num_queues) +{ + struct anv_physical_device *physical_device = device->physical; + VkResult result = VK_SUCCESS; + + if (device->physical->engine_info) { + /* The kernel API supports at most 64 engines */ + assert(num_queues <= 64); + uint16_t engine_classes[64]; + int engine_count = 0; + for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { + const VkDeviceQueueCreateInfo *queueCreateInfo = + &pCreateInfo->pQueueCreateInfos[i]; + + assert(queueCreateInfo->queueFamilyIndex < + physical_device->queue.family_count); + struct anv_queue_family *queue_family = + &physical_device->queue.families[queueCreateInfo->queueFamilyIndex]; + + for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++) + engine_classes[engine_count++] = queue_family->engine_class; + } + 
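As a compact illustration of the flattening done just above: every VkDeviceQueueCreateInfo contributes queueCount copies of its family's engine class, so the kernel context ends up with one engine per requested queue. The structs below are placeholder stand-ins for the driver's real queue-family data, written only to show the shape of the loop.

#include <assert.h>
#include <stdint.h>

/* Placeholder stand-ins for the driver's queue-family table. */
struct example_queue_family { uint16_t engine_class; };
struct example_queue_create { uint32_t family_index; uint32_t queue_count; };

/* Flatten the requested queues into one engine-class entry per queue and
 * return how many engines were gathered.
 */
static uint32_t
example_gather_engines(const struct example_queue_family *families,
                       const struct example_queue_create *infos,
                       uint32_t info_count,
                       uint16_t *engine_classes, uint32_t max_engines)
{
   uint32_t count = 0;
   for (uint32_t i = 0; i < info_count; i++) {
      for (uint32_t j = 0; j < infos[i].queue_count; j++) {
         assert(count < max_engines); /* the kernel API caps this at 64 */
         engine_classes[count++] =
            families[infos[i].family_index].engine_class;
      }
   }
   return count;
}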
device->context_id = + intel_gem_create_context_engines(device->fd, + physical_device->engine_info, + engine_count, engine_classes); + } else { + assert(num_queues == 1); + device->context_id = anv_gem_create_context(device); + } + + if (device->context_id == -1) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + return result; + } + + /* Here we tell the kernel not to attempt to recover our context but + * immediately (on the next batchbuffer submission) report that the + * context is lost, and we will do the recovery ourselves. In the case + * of Vulkan, recovery means throwing VK_ERROR_DEVICE_LOST and letting + * the client clean up the pieces. + */ + anv_gem_set_context_param(device->fd, device->context_id, + I915_CONTEXT_PARAM_RECOVERABLE, false); + + /* Check if client specified queue priority. */ + const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority = + vk_find_struct_const(pCreateInfo->pQueueCreateInfos[0].pNext, + DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR); + + VkQueueGlobalPriorityKHR priority = + queue_priority ? queue_priority->globalPriority : + VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; + + /* As per spec, the driver implementation may deny requests to acquire + * a priority above the default priority (MEDIUM) if the caller does not + * have sufficient privileges. In this scenario VK_ERROR_NOT_PERMITTED_KHR + * is returned. + */ + if (physical_device->max_context_priority >= INTEL_CONTEXT_MEDIUM_PRIORITY) { + int err = anv_gem_set_context_param(device->fd, device->context_id, + I915_CONTEXT_PARAM_PRIORITY, + vk_priority_to_gen(priority)); + if (err != 0 && priority > VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) { + result = vk_error(device, VK_ERROR_NOT_PERMITTED_KHR); + goto fail_context; + } + } + + return result; + +fail_context: + anv_gem_destroy_context(device, device->context_id); + return result; +} + +VkResult anv_CreateDevice( + VkPhysicalDevice physicalDevice, + const VkDeviceCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDevice* pDevice) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + VkResult result; + struct anv_device *device; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO); + + /* Check enabled features */ + bool robust_buffer_access = false; + if (pCreateInfo->pEnabledFeatures) { + if (pCreateInfo->pEnabledFeatures->robustBufferAccess) + robust_buffer_access = true; + } + + vk_foreach_struct_const(ext, pCreateInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2: { + const VkPhysicalDeviceFeatures2 *features = (const void *)ext; + if (features->features.robustBufferAccess) + robust_buffer_access = true; + break; + } + + default: + /* Don't warn */ + break; + } + } + + /* Check requested queues and fail if we are requested to create any + * queues with flags we don't support. 
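A hedged sketch of the priority policy implemented above: the requested global priority is translated to an i915 context priority, and only a failed attempt to raise the priority above the default (MEDIUM) is surfaced as VK_ERROR_NOT_PERMITTED_KHR; a failed attempt at or below the default is ignored. The enum and helper below are illustrative stand-ins, not the driver's definitions.

#include <stdbool.h>

/* Illustrative priority levels; the driver uses the
 * VK_QUEUE_GLOBAL_PRIORITY_*_KHR and INTEL_CONTEXT_*_PRIORITY values.
 */
enum example_priority {
   EX_PRIO_LOW,
   EX_PRIO_MEDIUM,   /* the default */
   EX_PRIO_HIGH,
   EX_PRIO_REALTIME,
};

/* Returns true when a failed I915_CONTEXT_PARAM_PRIORITY change must be
 * reported to the application: only raising above MEDIUM needs privileges.
 */
static bool
example_priority_error_is_fatal(int set_param_err,
                                enum example_priority requested)
{
   return set_param_err != 0 && requested > EX_PRIO_MEDIUM;
}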
+ */ + assert(pCreateInfo->queueCreateInfoCount > 0); + for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { + if (pCreateInfo->pQueueCreateInfos[i].flags != 0) + return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED); + } + + device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator, + sizeof(*device), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!device) + return vk_error(physical_device, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct vk_device_dispatch_table dispatch_table; + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + anv_genX(&physical_device->info, device_entrypoints), true); + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &anv_device_entrypoints, false); + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &wsi_device_entrypoints, false); + + result = vk_device_init(&device->vk, &physical_device->vk, + &dispatch_table, pCreateInfo, pAllocator); + if (result != VK_SUCCESS) + goto fail_alloc; + + if (INTEL_DEBUG(DEBUG_BATCH)) { + const unsigned decode_flags = + INTEL_BATCH_DECODE_FULL | + (INTEL_DEBUG(DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) | + INTEL_BATCH_DECODE_OFFSETS | + INTEL_BATCH_DECODE_FLOATS; + + intel_batch_decode_ctx_init(&device->decoder_ctx, + &physical_device->compiler->isa, + &physical_device->info, + stderr, decode_flags, NULL, + decode_get_bo, NULL, device); + + device->decoder_ctx.dynamic_base = DYNAMIC_STATE_POOL_MIN_ADDRESS; + device->decoder_ctx.surface_base = SURFACE_STATE_POOL_MIN_ADDRESS; + device->decoder_ctx.instruction_base = + INSTRUCTION_STATE_POOL_MIN_ADDRESS; + } + + anv_device_set_physical(device, physical_device); + + /* XXX(chadv): Can we dup() physicalDevice->fd here? */ + device->fd = open(physical_device->path, O_RDWR | O_CLOEXEC); + if (device->fd == -1) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_device; + } + + device->vk.check_status = anv_device_check_status; + device->vk.create_sync_for_memory = anv_create_sync_for_memory; + vk_device_set_drm_fd(&device->vk, device->fd); + + uint32_t num_queues = 0; + for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) + num_queues += pCreateInfo->pQueueCreateInfos[i].queueCount; + + result = anv_device_setup_context(device, pCreateInfo, num_queues); + if (result != VK_SUCCESS) + goto fail_fd; + + device->queues = + vk_zalloc(&device->vk.alloc, num_queues * sizeof(*device->queues), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (device->queues == NULL) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_context_id; + } + + device->queue_count = 0; + for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { + const VkDeviceQueueCreateInfo *queueCreateInfo = + &pCreateInfo->pQueueCreateInfos[i]; + + for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++) { + /* When using legacy contexts, we use I915_EXEC_RENDER but, with + * engine-based contexts, the bottom 6 bits of exec_flags are used + * for the engine ID. + */ + uint32_t exec_flags = device->physical->engine_info ? 
+ device->queue_count : I915_EXEC_RENDER; + + result = anv_queue_init(device, &device->queues[device->queue_count], + exec_flags, queueCreateInfo, j); + if (result != VK_SUCCESS) + goto fail_queues; + + device->queue_count++; + } + } + + if (!anv_use_relocations(physical_device)) { + if (pthread_mutex_init(&device->vma_mutex, NULL) != 0) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_queues; + } + + /* keep the page with address zero out of the allocator */ + util_vma_heap_init(&device->vma_lo, + LOW_HEAP_MIN_ADDRESS, LOW_HEAP_SIZE); + + util_vma_heap_init(&device->vma_cva, CLIENT_VISIBLE_HEAP_MIN_ADDRESS, + CLIENT_VISIBLE_HEAP_SIZE); + + /* Leave the last 4GiB out of the high vma range, so that no state + * base address + size can overflow 48 bits. For more information see + * the comment about Wa32bitGeneralStateOffset in anv_allocator.c + */ + util_vma_heap_init(&device->vma_hi, HIGH_HEAP_MIN_ADDRESS, + physical_device->gtt_size - (1ull << 32) - + HIGH_HEAP_MIN_ADDRESS); + } + + list_inithead(&device->memory_objects); + + /* On Broadwell and later, we can use batch chaining to more efficiently + * implement growing command buffers. Prior to Haswell, the kernel + * command parser gets in the way and we have to fall back to growing + * the batch. + */ + device->can_chain_batches = device->info->ver >= 8; + + device->robust_buffer_access = robust_buffer_access; + + if (pthread_mutex_init(&device->mutex, NULL) != 0) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_vmas; + } + + pthread_condattr_t condattr; + if (pthread_condattr_init(&condattr) != 0) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_mutex; + } + if (pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC) != 0) { + pthread_condattr_destroy(&condattr); + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_mutex; + } + if (pthread_cond_init(&device->queue_submit, &condattr) != 0) { + pthread_condattr_destroy(&condattr); + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_mutex; + } + pthread_condattr_destroy(&condattr); + + result = anv_bo_cache_init(&device->bo_cache, device); + if (result != VK_SUCCESS) + goto fail_queue_cond; + + anv_bo_pool_init(&device->batch_bo_pool, device, "batch"); + + /* Because scratch is also relative to General State Base Address, we leave + * the base address 0 and start the pool memory at an offset. This way we + * get the correct offsets in the anv_states that get allocated from it. + */ + result = anv_state_pool_init(&device->general_state_pool, device, + "general pool", + 0, GENERAL_STATE_POOL_MIN_ADDRESS, 16384); + if (result != VK_SUCCESS) + goto fail_batch_bo_pool; + + result = anv_state_pool_init(&device->dynamic_state_pool, device, + "dynamic pool", + DYNAMIC_STATE_POOL_MIN_ADDRESS, 0, 16384); + if (result != VK_SUCCESS) + goto fail_general_state_pool; + + if (device->info->ver >= 8) { + /* The border color pointer is limited to 24 bits, so we need to make + * sure that any such color used at any point in the program doesn't + * exceed that limit. + * We achieve that by reserving all the custom border colors we support + * right off the bat, so they are close to the base address. 
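A rough sanity check of the reservation strategy described above: reserving every custom border color up front keeps the whole range next to the dynamic state base, so each entry stays reachable through the 24-bit border color pointer. The two sizes below are assumptions chosen for illustration; MAX_CUSTOM_BORDER_COLORS and the gfx8 border-color layout are not spelled out here.

#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* Assumed values, for illustration only. */
   const uint32_t max_custom_border_colors = 4096;
   const uint32_t border_color_size = 64;              /* bytes per entry */
   const uint64_t border_color_ptr_limit = 1ull << 24; /* 24-bit pointer  */

   /* The whole reserved block must fit below the 24-bit pointer limit. */
   assert((uint64_t)max_custom_border_colors * border_color_size <
          border_color_ptr_limit);
   return 0;
}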
+ */ + anv_state_reserved_pool_init(&device->custom_border_colors, + &device->dynamic_state_pool, + MAX_CUSTOM_BORDER_COLORS, + sizeof(struct gfx8_border_color), 64); + } + + result = anv_state_pool_init(&device->instruction_state_pool, device, + "instruction pool", + INSTRUCTION_STATE_POOL_MIN_ADDRESS, 0, 16384); + if (result != VK_SUCCESS) + goto fail_dynamic_state_pool; + + result = anv_state_pool_init(&device->surface_state_pool, device, + "surface state pool", + SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096); + if (result != VK_SUCCESS) + goto fail_instruction_state_pool; + + if (device->info->verx10 >= 125) { + /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to give the binding + * table its own base address separately from surface state base. + */ + result = anv_state_pool_init(&device->binding_table_pool, device, + "binding table pool", + BINDING_TABLE_POOL_MIN_ADDRESS, 0, + BINDING_TABLE_POOL_BLOCK_SIZE); + } else if (!anv_use_relocations(physical_device)) { + int64_t bt_pool_offset = (int64_t)BINDING_TABLE_POOL_MIN_ADDRESS - + (int64_t)SURFACE_STATE_POOL_MIN_ADDRESS; + assert(INT32_MIN < bt_pool_offset && bt_pool_offset < 0); + result = anv_state_pool_init(&device->binding_table_pool, device, + "binding table pool", + SURFACE_STATE_POOL_MIN_ADDRESS, + bt_pool_offset, + BINDING_TABLE_POOL_BLOCK_SIZE); + } + if (result != VK_SUCCESS) + goto fail_surface_state_pool; + + if (device->info->has_aux_map) { + device->aux_map_ctx = intel_aux_map_init(device, &aux_map_allocator, + &physical_device->info); + if (!device->aux_map_ctx) + goto fail_binding_table_pool; + } + + result = anv_device_alloc_bo(device, "workaround", 4096, + ANV_BO_ALLOC_CAPTURE | + ANV_BO_ALLOC_MAPPED, + 0 /* explicit_address */, + &device->workaround_bo); + if (result != VK_SUCCESS) + goto fail_surface_aux_map_pool; + + device->workaround_address = (struct anv_address) { + .bo = device->workaround_bo, + .offset = align_u32( + intel_debug_write_identifiers(device->workaround_bo->map, + device->workaround_bo->size, + "Anv") + 8, 8), + }; + + device->debug_frame_desc = + intel_debug_get_identifier_block(device->workaround_bo->map, + device->workaround_bo->size, + INTEL_DEBUG_BLOCK_TYPE_FRAME); + + if (device->vk.enabled_extensions.KHR_ray_query) { + uint32_t ray_queries_size = + align_u32(brw_rt_ray_queries_hw_stacks_size(device->info), 4096); + + result = anv_device_alloc_bo(device, "ray queries", + ray_queries_size, + 0, + 0 /* explicit_address */, + &device->ray_query_bo); + if (result != VK_SUCCESS) + goto fail_workaround_bo; + } + + result = anv_device_init_trivial_batch(device); + if (result != VK_SUCCESS) + goto fail_ray_query_bo; + + if (device->info->ver >= 12 && + device->vk.enabled_extensions.KHR_fragment_shading_rate) { + uint32_t n_cps_states = 3 * 3; /* All combinaisons of X by Y CP sizes (1, 2, 4) */ + + if (device->info->has_coarse_pixel_primitive_and_cb) + n_cps_states *= 5 * 5; /* 5 combiners by 2 operators */ + + n_cps_states += 1; /* Disable CPS */ + + /* Each of the combinaison must be replicated on all viewports */ + n_cps_states *= MAX_VIEWPORTS; + + device->cps_states = + anv_state_pool_alloc(&device->dynamic_state_pool, + n_cps_states * CPS_STATE_length(device->info) * 4, + 32); + if (device->cps_states.map == NULL) + goto fail_trivial_batch; + + anv_genX(device->info, init_cps_device_state)(device); + } + + /* Allocate a null surface state at surface state offset 0. This makes + * NULL descriptor handling trivial because we can just memset structures + * to zero and they have a valid descriptor. 
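The zero-is-null property relied on here can be shown with a toy descriptor: because the very first surface state allocated from the pool lands at offset 0, a descriptor that has merely been memset to zero already references a valid null surface. The struct below is a hypothetical stand-in for the driver's descriptor layout.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical descriptor: just an offset into the surface state pool. */
struct example_descriptor { uint32_t surface_state_offset; };

int main(void)
{
   const uint32_t null_surface_state_offset = 0; /* first pool allocation */

   struct example_descriptor desc;
   memset(&desc, 0, sizeof(desc));

   /* A zero-filled descriptor already points at the null surface state. */
   assert(desc.surface_state_offset == null_surface_state_offset);
   return 0;
}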
+ */ + device->null_surface_state = + anv_state_pool_alloc(&device->surface_state_pool, + device->isl_dev.ss.size, + device->isl_dev.ss.align); + isl_null_fill_state(&device->isl_dev, device->null_surface_state.map, + .size = isl_extent3d(1, 1, 1) /* This shouldn't matter */); + assert(device->null_surface_state.offset == 0); + + anv_scratch_pool_init(device, &device->scratch_pool); + + /* TODO(RT): Do we want some sort of data structure for this? */ + memset(device->rt_scratch_bos, 0, sizeof(device->rt_scratch_bos)); + + result = anv_genX(device->info, init_device_state)(device); + if (result != VK_SUCCESS) + goto fail_trivial_batch_bo_and_scratch_pool; + + struct vk_pipeline_cache_create_info pcc_info = { }; + device->default_pipeline_cache = + vk_pipeline_cache_create(&device->vk, &pcc_info, NULL); + if (!device->default_pipeline_cache) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_trivial_batch_bo_and_scratch_pool; + } + + /* Internal shaders need their own pipeline cache because, unlike the rest + * of ANV, it won't work at all without the cache. It depends on it for + * shaders to remain resident while it runs. Therefore, we need a special + * cache just for BLORP/RT that's forced to always be enabled. + */ + pcc_info.force_enable = true; + device->internal_cache = + vk_pipeline_cache_create(&device->vk, &pcc_info, NULL); + if (device->internal_cache == NULL) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_default_pipeline_cache; + } + + result = anv_device_init_rt_shaders(device); + if (result != VK_SUCCESS) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_internal_cache; + } + + anv_device_init_blorp(device); + + anv_device_init_border_colors(device); + + anv_device_perf_init(device); + + anv_device_utrace_init(device); + + *pDevice = anv_device_to_handle(device); + + return VK_SUCCESS; + + fail_internal_cache: + vk_pipeline_cache_destroy(device->internal_cache, NULL); + fail_default_pipeline_cache: + vk_pipeline_cache_destroy(device->default_pipeline_cache, NULL); + fail_trivial_batch_bo_and_scratch_pool: + anv_scratch_pool_finish(device, &device->scratch_pool); + fail_trivial_batch: + anv_device_release_bo(device, device->trivial_batch_bo); + fail_ray_query_bo: + if (device->ray_query_bo) + anv_device_release_bo(device, device->ray_query_bo); + fail_workaround_bo: + anv_device_release_bo(device, device->workaround_bo); + fail_surface_aux_map_pool: + if (device->info->has_aux_map) { + intel_aux_map_finish(device->aux_map_ctx); + device->aux_map_ctx = NULL; + } + fail_binding_table_pool: + if (!anv_use_relocations(physical_device)) + anv_state_pool_finish(&device->binding_table_pool); + fail_surface_state_pool: + anv_state_pool_finish(&device->surface_state_pool); + fail_instruction_state_pool: + anv_state_pool_finish(&device->instruction_state_pool); + fail_dynamic_state_pool: + if (device->info->ver >= 8) + anv_state_reserved_pool_finish(&device->custom_border_colors); + anv_state_pool_finish(&device->dynamic_state_pool); + fail_general_state_pool: + anv_state_pool_finish(&device->general_state_pool); + fail_batch_bo_pool: + anv_bo_pool_finish(&device->batch_bo_pool); + anv_bo_cache_finish(&device->bo_cache); + fail_queue_cond: + pthread_cond_destroy(&device->queue_submit); + fail_mutex: + pthread_mutex_destroy(&device->mutex); + fail_vmas: + if (!anv_use_relocations(physical_device)) { + util_vma_heap_finish(&device->vma_hi); + util_vma_heap_finish(&device->vma_cva); + util_vma_heap_finish(&device->vma_lo); 
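The error handling in this function follows the usual goto-ladder idiom: each fail_* label undoes exactly the initialization steps that completed before the jump, in reverse order. A minimal sketch of the same pattern, with made-up resource names:

#include <stdbool.h>

/* Hypothetical resources standing in for pools, BOs, mutexes, ... */
static bool init_a(void) { return true; }
static bool init_b(void) { return true; }
static bool init_c(void) { return false; } /* pretend this one fails */
static void finish_a(void) {}
static void finish_b(void) {}

static int
example_create(void)
{
   if (!init_a())
      return -1;
   if (!init_b())
      goto fail_a;
   if (!init_c())
      goto fail_b;
   return 0;

fail_b:
   finish_b();   /* undo in reverse order of initialization */
fail_a:
   finish_a();
   return -1;
}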
+ } + fail_queues: + for (uint32_t i = 0; i < device->queue_count; i++) + anv_queue_finish(&device->queues[i]); + vk_free(&device->vk.alloc, device->queues); + fail_context_id: + anv_gem_destroy_context(device, device->context_id); + fail_fd: + close(device->fd); + fail_device: + vk_device_finish(&device->vk); + fail_alloc: + vk_free(&device->vk.alloc, device); + + return result; +} + +void anv_DestroyDevice( + VkDevice _device, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!device) + return; + + anv_device_utrace_finish(device); + + anv_device_finish_blorp(device); + + anv_device_finish_rt_shaders(device); + + vk_pipeline_cache_destroy(device->internal_cache, NULL); + vk_pipeline_cache_destroy(device->default_pipeline_cache, NULL); + +#ifdef HAVE_VALGRIND + /* We only need to free these to prevent valgrind errors. The backing + * BO will go away in a couple of lines so we don't actually leak. + */ + if (device->info->ver >= 8) + anv_state_reserved_pool_finish(&device->custom_border_colors); + anv_state_pool_free(&device->dynamic_state_pool, device->border_colors); + anv_state_pool_free(&device->dynamic_state_pool, device->slice_hash); + anv_state_pool_free(&device->dynamic_state_pool, device->cps_states); +#endif + + for (unsigned i = 0; i < ARRAY_SIZE(device->rt_scratch_bos); i++) { + if (device->rt_scratch_bos[i] != NULL) + anv_device_release_bo(device, device->rt_scratch_bos[i]); + } + + anv_scratch_pool_finish(device, &device->scratch_pool); + + if (device->vk.enabled_extensions.KHR_ray_query) { + for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_shadow_bos); i++) { + if (device->ray_query_shadow_bos[i] != NULL) + anv_device_release_bo(device, device->ray_query_shadow_bos[i]); + } + anv_device_release_bo(device, device->ray_query_bo); + } + anv_device_release_bo(device, device->workaround_bo); + anv_device_release_bo(device, device->trivial_batch_bo); + + if (device->info->has_aux_map) { + intel_aux_map_finish(device->aux_map_ctx); + device->aux_map_ctx = NULL; + } + + if (!anv_use_relocations(device->physical)) + anv_state_pool_finish(&device->binding_table_pool); + anv_state_pool_finish(&device->surface_state_pool); + anv_state_pool_finish(&device->instruction_state_pool); + anv_state_pool_finish(&device->dynamic_state_pool); + anv_state_pool_finish(&device->general_state_pool); + + anv_bo_pool_finish(&device->batch_bo_pool); + + anv_bo_cache_finish(&device->bo_cache); + + if (!anv_use_relocations(device->physical)) { + util_vma_heap_finish(&device->vma_hi); + util_vma_heap_finish(&device->vma_cva); + util_vma_heap_finish(&device->vma_lo); + } + + pthread_cond_destroy(&device->queue_submit); + pthread_mutex_destroy(&device->mutex); + + for (uint32_t i = 0; i < device->queue_count; i++) + anv_queue_finish(&device->queues[i]); + vk_free(&device->vk.alloc, device->queues); + + anv_gem_destroy_context(device, device->context_id); + + if (INTEL_DEBUG(DEBUG_BATCH)) + intel_batch_decode_ctx_finish(&device->decoder_ctx); + + close(device->fd); + + vk_device_finish(&device->vk); + vk_free(&device->vk.alloc, device); +} + +VkResult anv_EnumerateInstanceLayerProperties( + uint32_t* pPropertyCount, + VkLayerProperties* pProperties) +{ + if (pProperties == NULL) { + *pPropertyCount = 0; + return VK_SUCCESS; + } + + /* None supported at this time */ + return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT); +} + +static VkResult +anv_device_check_status(struct vk_device *vk_device) +{ + struct anv_device *device = container_of(vk_device, 
struct anv_device, vk); + + uint32_t active, pending; + int ret = anv_gem_context_get_reset_stats(device->fd, device->context_id, + &active, &pending); + if (ret == -1) { + /* We don't know the real error. */ + return vk_device_set_lost(&device->vk, "get_reset_stats failed: %m"); + } + + if (active) { + return vk_device_set_lost(&device->vk, "GPU hung on one of our command buffers"); + } else if (pending) { + return vk_device_set_lost(&device->vk, "GPU hung with commands in-flight"); + } + + return VK_SUCCESS; +} + +VkResult +anv_device_wait(struct anv_device *device, struct anv_bo *bo, + int64_t timeout) +{ + int ret = anv_gem_wait(device, bo->gem_handle, &timeout); + if (ret == -1 && errno == ETIME) { + return VK_TIMEOUT; + } else if (ret == -1) { + /* We don't know the real error. */ + return vk_device_set_lost(&device->vk, "gem wait failed: %m"); + } else { + return VK_SUCCESS; + } +} + +uint64_t +anv_vma_alloc(struct anv_device *device, + uint64_t size, uint64_t align, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address) +{ + pthread_mutex_lock(&device->vma_mutex); + + uint64_t addr = 0; + + if (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) { + if (client_address) { + if (util_vma_heap_alloc_addr(&device->vma_cva, + client_address, size)) { + addr = client_address; + } + } else { + addr = util_vma_heap_alloc(&device->vma_cva, size, align); + } + /* We don't want to fall back to other heaps */ + goto done; + } + + assert(client_address == 0); + + if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS)) + addr = util_vma_heap_alloc(&device->vma_hi, size, align); + + if (addr == 0) + addr = util_vma_heap_alloc(&device->vma_lo, size, align); + +done: + pthread_mutex_unlock(&device->vma_mutex); + + assert(addr == intel_48b_address(addr)); + return intel_canonical_address(addr); +} + +void +anv_vma_free(struct anv_device *device, + uint64_t address, uint64_t size) +{ + const uint64_t addr_48b = intel_48b_address(address); + + pthread_mutex_lock(&device->vma_mutex); + + if (addr_48b >= LOW_HEAP_MIN_ADDRESS && + addr_48b <= LOW_HEAP_MAX_ADDRESS) { + util_vma_heap_free(&device->vma_lo, addr_48b, size); + } else if (addr_48b >= CLIENT_VISIBLE_HEAP_MIN_ADDRESS && + addr_48b <= CLIENT_VISIBLE_HEAP_MAX_ADDRESS) { + util_vma_heap_free(&device->vma_cva, addr_48b, size); + } else { + assert(addr_48b >= HIGH_HEAP_MIN_ADDRESS); + util_vma_heap_free(&device->vma_hi, addr_48b, size); + } + + pthread_mutex_unlock(&device->vma_mutex); +} + +VkResult anv_AllocateMemory( + VkDevice _device, + const VkMemoryAllocateInfo* pAllocateInfo, + const VkAllocationCallbacks* pAllocator, + VkDeviceMemory* pMem) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_physical_device *pdevice = device->physical; + struct anv_device_memory *mem; + VkResult result = VK_SUCCESS; + + assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO); + + /* The Vulkan 1.0.33 spec says "allocationSize must be greater than 0". 
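The anv_vma_alloc() path above hands back addresses in canonical form. A minimal sketch of that convention, under the assumption (matching how intel_canonical_address() is commonly implemented) that canonicalization sign-extends bit 47 into the upper 16 bits; the helpers here are illustrative, not the driver's.

#include <assert.h>
#include <stdint.h>

/* Keep only the low 48 bits of an address. */
static uint64_t
example_48b_address(uint64_t addr)
{
   return addr & ((1ull << 48) - 1);
}

/* Replicate bit 47 into bits 48..63 (canonical form).  Relies on the
 * ubiquitous two's-complement arithmetic right shift.
 */
static uint64_t
example_canonical_address(uint64_t addr)
{
   return (uint64_t)((int64_t)(addr << 16) >> 16);
}

int main(void)
{
   uint64_t addr = 0xffff800000000000ull; /* high-heap style address */
   assert(example_canonical_address(example_48b_address(addr)) == addr);
   return 0;
}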
*/ + assert(pAllocateInfo->allocationSize > 0); + + VkDeviceSize aligned_alloc_size = + align_u64(pAllocateInfo->allocationSize, 4096); + + if (aligned_alloc_size > MAX_MEMORY_ALLOCATION_SIZE) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.type_count); + struct anv_memory_type *mem_type = + &pdevice->memory.types[pAllocateInfo->memoryTypeIndex]; + assert(mem_type->heapIndex < pdevice->memory.heap_count); + struct anv_memory_heap *mem_heap = + &pdevice->memory.heaps[mem_type->heapIndex]; + + uint64_t mem_heap_used = p_atomic_read(&mem_heap->used); + if (mem_heap_used + aligned_alloc_size > mem_heap->size) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + mem = vk_object_alloc(&device->vk, pAllocator, sizeof(*mem), + VK_OBJECT_TYPE_DEVICE_MEMORY); + if (mem == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + mem->type = mem_type; + mem->map = NULL; + mem->map_size = 0; + mem->map_delta = 0; + mem->ahw = NULL; + mem->host_ptr = NULL; + + enum anv_bo_alloc_flags alloc_flags = 0; + + const VkExportMemoryAllocateInfo *export_info = NULL; + const VkImportAndroidHardwareBufferInfoANDROID *ahw_import_info = NULL; + const VkImportMemoryFdInfoKHR *fd_info = NULL; + const VkImportMemoryHostPointerInfoEXT *host_ptr_info = NULL; + const VkMemoryDedicatedAllocateInfo *dedicated_info = NULL; + VkMemoryAllocateFlags vk_flags = 0; + uint64_t client_address = 0; + + vk_foreach_struct_const(ext, pAllocateInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO: + export_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID: + ahw_import_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR: + fd_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT: + host_ptr_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO: { + const VkMemoryAllocateFlagsInfo *flags_info = (void *)ext; + vk_flags = flags_info->flags; + break; + } + + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO: + dedicated_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO: { + const VkMemoryOpaqueCaptureAddressAllocateInfo *addr_info = + (const VkMemoryOpaqueCaptureAddressAllocateInfo *)ext; + client_address = addr_info->opaqueCaptureAddress; + break; + } + + default: + if (ext->sType != VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA) + /* this isn't a real enum value, + * so use conditional to avoid compiler warn + */ + anv_debug_ignored_stype(ext->sType); + break; + } + } + + /* By default, we want all VkDeviceMemory objects to support CCS */ + if (device->physical->has_implicit_ccs && device->info->has_aux_map) + alloc_flags |= ANV_BO_ALLOC_IMPLICIT_CCS; + + /* If i915 reported a mappable/non_mappable vram regions and the + * application want lmem mappable, then we need to use the + * I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS flag to create our BO. 
+ */ + if (pdevice->vram_mappable.size > 0 && + pdevice->vram_non_mappable.size > 0 && + (mem_type->propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) && + (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) + alloc_flags |= ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE; + + if (vk_flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT) + alloc_flags |= ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS; + + if ((export_info && export_info->handleTypes) || + (fd_info && fd_info->handleType) || + (host_ptr_info && host_ptr_info->handleType)) { + /* Anything imported or exported is EXTERNAL */ + alloc_flags |= ANV_BO_ALLOC_EXTERNAL; + } + + /* Check if we need to support Android HW buffer export. If so, + * create AHardwareBuffer and import memory from it. + */ + bool android_export = false; + if (export_info && export_info->handleTypes & + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID) + android_export = true; + + if (ahw_import_info) { + result = anv_import_ahw_memory(_device, mem, ahw_import_info); + if (result != VK_SUCCESS) + goto fail; + + goto success; + } else if (android_export) { + result = anv_create_ahw_memory(_device, mem, pAllocateInfo); + if (result != VK_SUCCESS) + goto fail; + + goto success; + } + + /* The Vulkan spec permits handleType to be 0, in which case the struct is + * ignored. + */ + if (fd_info && fd_info->handleType) { + /* At the moment, we support only the below handle types. */ + assert(fd_info->handleType == + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || + fd_info->handleType == + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); + + result = anv_device_import_bo(device, fd_info->fd, alloc_flags, + client_address, &mem->bo); + if (result != VK_SUCCESS) + goto fail; + + /* For security purposes, we reject importing the bo if it's smaller + * than the requested allocation size. This prevents a malicious client + * from passing a buffer to a trusted client, lying about the size, and + * telling the trusted client to try and texture from an image that goes + * out-of-bounds. This sort of thing could lead to GPU hangs or worse + * in the trusted client. The trusted client can protect itself against + * this sort of attack but only if it can trust the buffer size. + */ + if (mem->bo->size < aligned_alloc_size) { + result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "aligned allocationSize too large for " + "VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: " + "%"PRIu64"B > %"PRIu64"B", + aligned_alloc_size, mem->bo->size); + anv_device_release_bo(device, mem->bo); + goto fail; + } + + /* From the Vulkan spec: + * + * "Importing memory from a file descriptor transfers ownership of + * the file descriptor from the application to the Vulkan + * implementation. The application must not perform any operations on + * the file descriptor after a successful import." + * + * If the import fails, we leave the file descriptor open. 
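A stripped-down version of the size check made a few lines above: an imported opaque-fd/dma-buf BO is rejected when it is smaller than the page-aligned requested allocation size, since a lying exporter could otherwise trick the importer into out-of-bounds GPU accesses. The names and the 4096-byte page size below are illustrative assumptions.

#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_PAGE_SIZE 4096u

static uint64_t
example_align_u64(uint64_t v, uint64_t a)
{
   return (v + a - 1) & ~(a - 1);
}

/* Returns true when an imported BO of bo_size bytes may safely back an
 * allocation of the requested size.
 */
static bool
example_import_size_ok(uint64_t bo_size, uint64_t requested_size)
{
   return bo_size >= example_align_u64(requested_size, EXAMPLE_PAGE_SIZE);
}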
+ */ + close(fd_info->fd); + goto success; + } + + if (host_ptr_info && host_ptr_info->handleType) { + if (host_ptr_info->handleType == + VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT) { + result = vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + goto fail; + } + + assert(host_ptr_info->handleType == + VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT); + + result = anv_device_import_bo_from_host_ptr(device, + host_ptr_info->pHostPointer, + pAllocateInfo->allocationSize, + alloc_flags, + client_address, + &mem->bo); + if (result != VK_SUCCESS) + goto fail; + + mem->host_ptr = host_ptr_info->pHostPointer; + goto success; + } + + /* Regular allocate (not importing memory). */ + + result = anv_device_alloc_bo(device, "user", pAllocateInfo->allocationSize, + alloc_flags, client_address, &mem->bo); + if (result != VK_SUCCESS) + goto fail; + + if (dedicated_info && dedicated_info->image != VK_NULL_HANDLE) { + ANV_FROM_HANDLE(anv_image, image, dedicated_info->image); + + /* Some legacy (non-modifiers) consumers need the tiling to be set on + * the BO. In this case, we have a dedicated allocation. + */ + if (image->vk.wsi_legacy_scanout) { + const struct isl_surf *surf = &image->planes[0].primary_surface.isl; + result = anv_device_set_bo_tiling(device, mem->bo, + surf->row_pitch_B, + surf->tiling); + if (result != VK_SUCCESS) { + anv_device_release_bo(device, mem->bo); + goto fail; + } + } + } + + success: + mem_heap_used = p_atomic_add_return(&mem_heap->used, mem->bo->size); + if (mem_heap_used > mem_heap->size) { + p_atomic_add(&mem_heap->used, -mem->bo->size); + anv_device_release_bo(device, mem->bo); + result = vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Out of heap memory"); + goto fail; + } + + pthread_mutex_lock(&device->mutex); + list_addtail(&mem->link, &device->memory_objects); + pthread_mutex_unlock(&device->mutex); + + *pMem = anv_device_memory_to_handle(mem); + + return VK_SUCCESS; + + fail: + vk_object_free(&device->vk, pAllocator, mem); + + return result; +} + +VkResult anv_GetMemoryFdKHR( + VkDevice device_h, + const VkMemoryGetFdInfoKHR* pGetFdInfo, + int* pFd) +{ + ANV_FROM_HANDLE(anv_device, dev, device_h); + ANV_FROM_HANDLE(anv_device_memory, mem, pGetFdInfo->memory); + + assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR); + + assert(pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || + pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); + + return anv_device_export_bo(dev, mem->bo, pFd); +} + +VkResult anv_GetMemoryFdPropertiesKHR( + VkDevice _device, + VkExternalMemoryHandleTypeFlagBits handleType, + int fd, + VkMemoryFdPropertiesKHR* pMemoryFdProperties) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + switch (handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + /* dma-buf can be imported as any memory type */ + pMemoryFdProperties->memoryTypeBits = + (1 << device->physical->memory.type_count) - 1; + return VK_SUCCESS; + + default: + /* The valid usage section for this function says: + * + * "handleType must not be one of the handle types defined as + * opaque." + * + * So opaque handle types fall into the default "unsupported" case. 
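The "any memory type" answer given above is just a full bitmask over the physical device's memory types. A hedged illustration, where type_count stands in for device->physical->memory.type_count:

#include <assert.h>
#include <stdint.h>

/* One bit per supported memory type; dma-buf imports may use any of them. */
static uint32_t
example_all_memory_types(uint32_t type_count)
{
   assert(type_count > 0 && type_count <= 32);
   return (uint32_t)((1ull << type_count) - 1);
}

int main(void)
{
   assert(example_all_memory_types(3) == 0x7); /* types 0, 1 and 2 */
   return 0;
}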
+ */ + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } +} + +VkResult anv_GetMemoryHostPointerPropertiesEXT( + VkDevice _device, + VkExternalMemoryHandleTypeFlagBits handleType, + const void* pHostPointer, + VkMemoryHostPointerPropertiesEXT* pMemoryHostPointerProperties) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + assert(pMemoryHostPointerProperties->sType == + VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT); + + switch (handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT: + /* Host memory can be imported as any memory type. */ + pMemoryHostPointerProperties->memoryTypeBits = + (1ull << device->physical->memory.type_count) - 1; + + return VK_SUCCESS; + + default: + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + } +} + +void anv_FreeMemory( + VkDevice _device, + VkDeviceMemory _mem, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_device_memory, mem, _mem); + + if (mem == NULL) + return; + + pthread_mutex_lock(&device->mutex); + list_del(&mem->link); + pthread_mutex_unlock(&device->mutex); + + if (mem->map) + anv_UnmapMemory(_device, _mem); + + p_atomic_add(&device->physical->memory.heaps[mem->type->heapIndex].used, + -mem->bo->size); + + anv_device_release_bo(device, mem->bo); + +#if defined(ANDROID) && ANDROID_API_LEVEL >= 26 + if (mem->ahw) + AHardwareBuffer_release(mem->ahw); +#endif + + vk_object_free(&device->vk, pAllocator, mem); +} + +VkResult anv_MapMemory( + VkDevice _device, + VkDeviceMemory _memory, + VkDeviceSize offset, + VkDeviceSize size, + VkMemoryMapFlags flags, + void** ppData) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_device_memory, mem, _memory); + + if (mem == NULL) { + *ppData = NULL; + return VK_SUCCESS; + } + + if (mem->host_ptr) { + *ppData = mem->host_ptr + offset; + return VK_SUCCESS; + } + + /* From the Vulkan spec version 1.0.32 docs for MapMemory: + * + * * memory must have been created with a memory type that reports + * VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT + */ + if (!(mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) { + return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, + "Memory object not mappable."); + } + + if (size == VK_WHOLE_SIZE) + size = mem->bo->size - offset; + + /* From the Vulkan spec version 1.0.32 docs for MapMemory: + * + * * If size is not equal to VK_WHOLE_SIZE, size must be greater than 0 + * assert(size != 0); + * * If size is not equal to VK_WHOLE_SIZE, size must be less than or + * equal to the size of the memory minus offset + */ + assert(size > 0); + assert(offset + size <= mem->bo->size); + + if (size != (size_t)size) { + return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, + "requested size 0x%"PRIx64" does not fit in %u bits", + size, (unsigned)(sizeof(size_t) * 8)); + } + + /* From the Vulkan 1.2.194 spec: + * + * "memory must not be currently host mapped" + */ + if (mem->map != NULL) { + return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, + "Memory object already mapped."); + } + + uint32_t gem_flags = 0; + + if (!device->info->has_llc && + (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) + gem_flags |= I915_MMAP_WC; + + /* GEM will fail to map if the offset isn't 4k-aligned. Round down. 
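The rounding that follows can be summarized with a tiny worked example: a map request at offset 0x1234 of size 0x100 maps the BO from 0x1000 for 0x1000 bytes (whole pages), and the pointer returned to the app is map + 0x234. A sketch of that arithmetic, using assumed helper names:

#include <assert.h>
#include <stdint.h>

#define EXAMPLE_PAGE_MASK 4095ull

static uint64_t
example_align_u64(uint64_t v, uint64_t a)
{
   return (v + a - 1) & ~(a - 1);
}

int main(void)
{
   uint64_t offset = 0x1234, size = 0x100;

   uint64_t map_offset = offset & ~EXAMPLE_PAGE_MASK;               /* 0x1000 */
   uint64_t map_size =
      example_align_u64(offset + size - map_offset, 4096);          /* 0x1000 */
   uint64_t map_delta = offset - map_offset;                        /* 0x234  */

   assert(map_offset == 0x1000 && map_size == 0x1000 && map_delta == 0x234);
   return 0;
}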
*/ + uint64_t map_offset; + if (!device->physical->has_mmap_offset) + map_offset = offset & ~4095ull; + else + map_offset = 0; + assert(offset >= map_offset); + uint64_t map_size = (offset + size) - map_offset; + + /* Let's map whole pages */ + map_size = align_u64(map_size, 4096); + + void *map; + VkResult result = anv_device_map_bo(device, mem->bo, map_offset, + map_size, gem_flags, &map); + if (result != VK_SUCCESS) + return result; + + mem->map = map; + mem->map_size = map_size; + mem->map_delta = (offset - map_offset); + *ppData = mem->map + mem->map_delta; + + return VK_SUCCESS; +} + +void anv_UnmapMemory( + VkDevice _device, + VkDeviceMemory _memory) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_device_memory, mem, _memory); + + if (mem == NULL || mem->host_ptr) + return; + + anv_device_unmap_bo(device, mem->bo, mem->map, mem->map_size); + + mem->map = NULL; + mem->map_size = 0; + mem->map_delta = 0; +} + +VkResult anv_FlushMappedMemoryRanges( + VkDevice _device, + uint32_t memoryRangeCount, + const VkMappedMemoryRange* pMemoryRanges) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!device->physical->memory.need_clflush) + return VK_SUCCESS; + + /* Make sure the writes we're flushing have landed. */ + __builtin_ia32_mfence(); + + for (uint32_t i = 0; i < memoryRangeCount; i++) { + ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryRanges[i].memory); + if (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) + continue; + + uint64_t map_offset = pMemoryRanges[i].offset + mem->map_delta; + if (map_offset >= mem->map_size) + continue; + + intel_clflush_range(mem->map + map_offset, + MIN2(pMemoryRanges[i].size, + mem->map_size - map_offset)); + } + + return VK_SUCCESS; +} + +VkResult anv_InvalidateMappedMemoryRanges( + VkDevice _device, + uint32_t memoryRangeCount, + const VkMappedMemoryRange* pMemoryRanges) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!device->physical->memory.need_clflush) + return VK_SUCCESS; + + for (uint32_t i = 0; i < memoryRangeCount; i++) { + ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryRanges[i].memory); + if (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) + continue; + + uint64_t map_offset = pMemoryRanges[i].offset + mem->map_delta; + if (map_offset >= mem->map_size) + continue; + + intel_invalidate_range(mem->map + map_offset, + MIN2(pMemoryRanges[i].size, + mem->map_size - map_offset)); + } + + /* Make sure no reads get moved up above the invalidate. 
*/ + __builtin_ia32_mfence(); + + return VK_SUCCESS; +} + +void anv_GetDeviceMemoryCommitment( + VkDevice device, + VkDeviceMemory memory, + VkDeviceSize* pCommittedMemoryInBytes) +{ + *pCommittedMemoryInBytes = 0; +} + +static void +anv_bind_buffer_memory(const VkBindBufferMemoryInfo *pBindInfo) +{ + ANV_FROM_HANDLE(anv_device_memory, mem, pBindInfo->memory); + ANV_FROM_HANDLE(anv_buffer, buffer, pBindInfo->buffer); + + assert(pBindInfo->sType == VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO); + + if (mem) { + assert(pBindInfo->memoryOffset < mem->bo->size); + assert(mem->bo->size - pBindInfo->memoryOffset >= buffer->vk.size); + buffer->address = (struct anv_address) { + .bo = mem->bo, + .offset = pBindInfo->memoryOffset, + }; + } else { + buffer->address = ANV_NULL_ADDRESS; + } +} + +VkResult anv_BindBufferMemory2( + VkDevice device, + uint32_t bindInfoCount, + const VkBindBufferMemoryInfo* pBindInfos) +{ + for (uint32_t i = 0; i < bindInfoCount; i++) + anv_bind_buffer_memory(&pBindInfos[i]); + + return VK_SUCCESS; +} + +VkResult anv_QueueBindSparse( + VkQueue _queue, + uint32_t bindInfoCount, + const VkBindSparseInfo* pBindInfo, + VkFence fence) +{ + ANV_FROM_HANDLE(anv_queue, queue, _queue); + if (vk_device_is_lost(&queue->device->vk)) + return VK_ERROR_DEVICE_LOST; + + return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT); +} + +// Event functions + +VkResult anv_CreateEvent( + VkDevice _device, + const VkEventCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkEvent* pEvent) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_event *event; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_EVENT_CREATE_INFO); + + event = vk_object_alloc(&device->vk, pAllocator, sizeof(*event), + VK_OBJECT_TYPE_EVENT); + if (event == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + event->state = anv_state_pool_alloc(&device->dynamic_state_pool, + sizeof(uint64_t), 8); + *(uint64_t *)event->state.map = VK_EVENT_RESET; + + *pEvent = anv_event_to_handle(event); + + return VK_SUCCESS; +} + +void anv_DestroyEvent( + VkDevice _device, + VkEvent _event, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_event, event, _event); + + if (!event) + return; + + anv_state_pool_free(&device->dynamic_state_pool, event->state); + + vk_object_free(&device->vk, pAllocator, event); +} + +VkResult anv_GetEventStatus( + VkDevice _device, + VkEvent _event) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_event, event, _event); + + if (vk_device_is_lost(&device->vk)) + return VK_ERROR_DEVICE_LOST; + + return *(uint64_t *)event->state.map; +} + +VkResult anv_SetEvent( + VkDevice _device, + VkEvent _event) +{ + ANV_FROM_HANDLE(anv_event, event, _event); + + *(uint64_t *)event->state.map = VK_EVENT_SET; + + return VK_SUCCESS; +} + +VkResult anv_ResetEvent( + VkDevice _device, + VkEvent _event) +{ + ANV_FROM_HANDLE(anv_event, event, _event); + + *(uint64_t *)event->state.map = VK_EVENT_RESET; + + return VK_SUCCESS; +} + +// Buffer functions + +static void +anv_get_buffer_memory_requirements(struct anv_device *device, + VkDeviceSize size, + VkBufferUsageFlags usage, + VkMemoryRequirements2* pMemoryRequirements) +{ + /* The Vulkan spec (git aaed022) says: + * + * memoryTypeBits is a bitfield and contains one bit set for every + * supported memory type for the resource. 
The bit `1<<i` is set if and
+    * only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
+    * structure for the physical device is supported.
+    */
+   uint32_t memory_types = (1ull << device->physical->memory.type_count) - 1;
+
+   /* Base alignment requirement of a cache line */
+   uint32_t alignment = 16;
+
+   if (usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT)
+      alignment = MAX2(alignment, ANV_UBO_ALIGNMENT);
+
+   pMemoryRequirements->memoryRequirements.size = size;
+   pMemoryRequirements->memoryRequirements.alignment = alignment;
+
+   /* Storage and Uniform buffers should have their size aligned to
+    * 32-bits to avoid boundary checks when the last DWord is not complete.
+    * This ensures that no internal padding is needed for 16-bit types.
+    */
+   if (device->robust_buffer_access &&
+       (usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT ||
+        usage & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT))
+      pMemoryRequirements->memoryRequirements.size = align_u64(size, 4);
+
+   pMemoryRequirements->memoryRequirements.memoryTypeBits = memory_types;
+
+   vk_foreach_struct(ext, pMemoryRequirements->pNext) {
+      switch (ext->sType) {
+      case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
+         VkMemoryDedicatedRequirements *requirements = (void *)ext;
+         requirements->prefersDedicatedAllocation = false;
+         requirements->requiresDedicatedAllocation = false;
+         break;
+      }
+
+      default:
+         anv_debug_ignored_stype(ext->sType);
+         break;
+      }
+   }
+}
+
+void anv_GetBufferMemoryRequirements2(
+    VkDevice                                    _device,
+    const VkBufferMemoryRequirementsInfo2*      pInfo,
+    VkMemoryRequirements2*                      pMemoryRequirements)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
+
+   anv_get_buffer_memory_requirements(device,
+                                      buffer->vk.size,
+                                      buffer->vk.usage,
+                                      pMemoryRequirements);
+}
+
+void anv_GetDeviceBufferMemoryRequirementsKHR(
+    VkDevice                                    _device,
+    const VkDeviceBufferMemoryRequirements*     pInfo,
+    VkMemoryRequirements2*                      pMemoryRequirements)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   anv_get_buffer_memory_requirements(device,
+                                      pInfo->pCreateInfo->size,
+                                      pInfo->pCreateInfo->usage,
+                                      pMemoryRequirements);
+}
+
+VkResult anv_CreateBuffer(
+    VkDevice                                    _device,
+    const VkBufferCreateInfo*                   pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkBuffer*                                   pBuffer)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_buffer *buffer;
+
+   /* Don't allow creating buffers bigger than our address space.  The real
+    * issue here is that we may align up the buffer size and we don't want
+    * doing so to cause roll-over.  However, no one has any business
+    * allocating a buffer larger than our GTT size.
+ */ + if (pCreateInfo->size > device->physical->gtt_size) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + buffer = vk_buffer_create(&device->vk, pCreateInfo, + pAllocator, sizeof(*buffer)); + if (buffer == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + buffer->address = ANV_NULL_ADDRESS; + + *pBuffer = anv_buffer_to_handle(buffer); + + return VK_SUCCESS; +} + +void anv_DestroyBuffer( + VkDevice _device, + VkBuffer _buffer, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + + if (!buffer) + return; + + vk_buffer_destroy(&device->vk, pAllocator, &buffer->vk); +} + +VkDeviceAddress anv_GetBufferDeviceAddress( + VkDevice device, + const VkBufferDeviceAddressInfo* pInfo) +{ + ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer); + + assert(!anv_address_is_null(buffer->address)); + assert(anv_bo_is_pinned(buffer->address.bo)); + + return anv_address_physical(buffer->address); +} + +uint64_t anv_GetBufferOpaqueCaptureAddress( + VkDevice device, + const VkBufferDeviceAddressInfo* pInfo) +{ + return 0; +} + +uint64_t anv_GetDeviceMemoryOpaqueCaptureAddress( + VkDevice device, + const VkDeviceMemoryOpaqueCaptureAddressInfo* pInfo) +{ + ANV_FROM_HANDLE(anv_device_memory, memory, pInfo->memory); + + assert(anv_bo_is_pinned(memory->bo)); + assert(memory->bo->has_client_visible_address); + + return intel_48b_address(memory->bo->offset); +} + +void +anv_fill_buffer_surface_state(struct anv_device *device, struct anv_state state, + enum isl_format format, + struct isl_swizzle swizzle, + isl_surf_usage_flags_t usage, + struct anv_address address, + uint32_t range, uint32_t stride) +{ + isl_buffer_fill_state(&device->isl_dev, state.map, + .address = anv_address_physical(address), + .mocs = isl_mocs(&device->isl_dev, usage, + address.bo && address.bo->is_external), + .size_B = range, + .format = format, + .swizzle = swizzle, + .stride_B = stride); +} + +void anv_DestroySampler( + VkDevice _device, + VkSampler _sampler, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_sampler, sampler, _sampler); + + if (!sampler) + return; + + if (sampler->bindless_state.map) { + anv_state_pool_free(&device->dynamic_state_pool, + sampler->bindless_state); + } + + if (sampler->custom_border_color.map) { + anv_state_reserved_pool_free(&device->custom_border_colors, + sampler->custom_border_color); + } + + vk_object_free(&device->vk, pAllocator, sampler); +} + +static const VkTimeDomainEXT anv_time_domains[] = { + VK_TIME_DOMAIN_DEVICE_EXT, + VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT, +#ifdef CLOCK_MONOTONIC_RAW + VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT, +#endif +}; + +VkResult anv_GetPhysicalDeviceCalibrateableTimeDomainsEXT( + VkPhysicalDevice physicalDevice, + uint32_t *pTimeDomainCount, + VkTimeDomainEXT *pTimeDomains) +{ + int d; + VK_OUTARRAY_MAKE_TYPED(VkTimeDomainEXT, out, pTimeDomains, pTimeDomainCount); + + for (d = 0; d < ARRAY_SIZE(anv_time_domains); d++) { + vk_outarray_append_typed(VkTimeDomainEXT, &out, i) { + *i = anv_time_domains[d]; + } + } + + return vk_outarray_status(&out); +} + +static uint64_t +anv_clock_gettime(clockid_t clock_id) +{ + struct timespec current; + int ret; + + ret = clock_gettime(clock_id, ¤t); +#ifdef CLOCK_MONOTONIC_RAW + if (ret < 0 && clock_id == CLOCK_MONOTONIC_RAW) + ret = clock_gettime(CLOCK_MONOTONIC, ¤t); +#endif + if (ret < 0) + return 0; + + return (uint64_t) current.tv_sec * 1000000000ULL + 
current.tv_nsec; +} + +VkResult anv_GetCalibratedTimestampsEXT( + VkDevice _device, + uint32_t timestampCount, + const VkCalibratedTimestampInfoEXT *pTimestampInfos, + uint64_t *pTimestamps, + uint64_t *pMaxDeviation) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + uint64_t timestamp_frequency = device->info->timestamp_frequency; + int ret; + int d; + uint64_t begin, end; + uint64_t max_clock_period = 0; + +#ifdef CLOCK_MONOTONIC_RAW + begin = anv_clock_gettime(CLOCK_MONOTONIC_RAW); +#else + begin = anv_clock_gettime(CLOCK_MONOTONIC); +#endif + + for (d = 0; d < timestampCount; d++) { + switch (pTimestampInfos[d].timeDomain) { + case VK_TIME_DOMAIN_DEVICE_EXT: + ret = anv_gem_reg_read(device->fd, TIMESTAMP | I915_REG_READ_8B_WA, + &pTimestamps[d]); + + if (ret != 0) { + return vk_device_set_lost(&device->vk, "Failed to read the " + "TIMESTAMP register: %m"); + } + uint64_t device_period = DIV_ROUND_UP(1000000000, timestamp_frequency); + max_clock_period = MAX2(max_clock_period, device_period); + break; + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT: + pTimestamps[d] = anv_clock_gettime(CLOCK_MONOTONIC); + max_clock_period = MAX2(max_clock_period, 1); + break; + +#ifdef CLOCK_MONOTONIC_RAW + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT: + pTimestamps[d] = begin; + break; +#endif + default: + pTimestamps[d] = 0; + break; + } + } + +#ifdef CLOCK_MONOTONIC_RAW + end = anv_clock_gettime(CLOCK_MONOTONIC_RAW); +#else + end = anv_clock_gettime(CLOCK_MONOTONIC); +#endif + + /* + * The maximum deviation is the sum of the interval over which we + * perform the sampling and the maximum period of any sampled + * clock. That's because the maximum skew between any two sampled + * clock edges is when the sampled clock with the largest period is + * sampled at the end of that period but right at the beginning of the + * sampling interval and some other clock is sampled right at the + * beginning of its sampling period and right at the end of the + * sampling interval. 
Let's assume the GPU has the longest clock + * period and that the application is sampling GPU and monotonic: + * + * s e + * w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f + * Raw -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_- + * + * g + * 0 1 2 3 + * GPU -----_____-----_____-----_____-----_____ + * + * m + * x y z 0 1 2 3 4 5 6 7 8 9 a b c + * Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_- + * + * Interval <-----------------> + * Deviation <--------------------------> + * + * s = read(raw) 2 + * g = read(GPU) 1 + * m = read(monotonic) 2 + * e = read(raw) b + * + * We round the sample interval up by one tick to cover sampling error + * in the interval clock + */ + + uint64_t sample_interval = end - begin + 1; + + *pMaxDeviation = sample_interval + max_clock_period; + + return VK_SUCCESS; +} + +void anv_GetPhysicalDeviceMultisamplePropertiesEXT( + VkPhysicalDevice physicalDevice, + VkSampleCountFlagBits samples, + VkMultisamplePropertiesEXT* pMultisampleProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + + assert(pMultisampleProperties->sType == + VK_STRUCTURE_TYPE_MULTISAMPLE_PROPERTIES_EXT); + + VkExtent2D grid_size; + if (samples & isl_device_get_sample_counts(&physical_device->isl_dev)) { + grid_size.width = 1; + grid_size.height = 1; + } else { + grid_size.width = 0; + grid_size.height = 0; + } + pMultisampleProperties->maxSampleLocationGridSize = grid_size; + + vk_foreach_struct(ext, pMultisampleProperties->pNext) + anv_debug_ignored_stype(ext->sType); +} + +/* vk_icd.h does not declare this function, so we declare it here to + * suppress Wmissing-prototypes. + */ +PUBLIC VKAPI_ATTR VkResult VKAPI_CALL +vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion); + +PUBLIC VKAPI_ATTR VkResult VKAPI_CALL +vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion) +{ + /* For the full details on loader interface versioning, see + * . + * What follows is a condensed summary, to help you navigate the large and + * confusing official doc. + * + * - Loader interface v0 is incompatible with later versions. We don't + * support it. + * + * - In loader interface v1: + * - The first ICD entrypoint called by the loader is + * vk_icdGetInstanceProcAddr(). The ICD must statically expose this + * entrypoint. + * - The ICD must statically expose no other Vulkan symbol unless it is + * linked with -Bsymbolic. + * - Each dispatchable Vulkan handle created by the ICD must be + * a pointer to a struct whose first member is VK_LOADER_DATA. The + * ICD must initialize VK_LOADER_DATA.loadMagic to ICD_LOADER_MAGIC. + * - The loader implements vkCreate{PLATFORM}SurfaceKHR() and + * vkDestroySurfaceKHR(). The ICD must be capable of working with + * such loader-managed surfaces. + * + * - Loader interface v2 differs from v1 in: + * - The first ICD entrypoint called by the loader is + * vk_icdNegotiateLoaderICDInterfaceVersion(). The ICD must + * statically expose this entrypoint. + * + * - Loader interface v3 differs from v2 in: + * - The ICD must implement vkCreate{PLATFORM}SurfaceKHR(), + * vkDestroySurfaceKHR(), and other API which uses VKSurfaceKHR, + * because the loader no longer does so. + * + * - Loader interface v4 differs from v3 in: + * - The ICD must implement vk_icdGetPhysicalDeviceProcAddr(). 
+ * + * - Loader interface v5 differs from v4 in: + * - The ICD must support Vulkan API version 1.1 and must not return + * VK_ERROR_INCOMPATIBLE_DRIVER from vkCreateInstance() unless a + * Vulkan Loader with interface v4 or smaller is being used and the + * application provides an API version that is greater than 1.0. + */ + *pSupportedVersion = MIN2(*pSupportedVersion, 5u); + return VK_SUCCESS; +} + +VkResult anv_GetPhysicalDeviceFragmentShadingRatesKHR( + VkPhysicalDevice physicalDevice, + uint32_t* pFragmentShadingRateCount, + VkPhysicalDeviceFragmentShadingRateKHR* pFragmentShadingRates) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + VK_OUTARRAY_MAKE_TYPED(VkPhysicalDeviceFragmentShadingRateKHR, out, + pFragmentShadingRates, pFragmentShadingRateCount); + +#define append_rate(_samples, _width, _height) \ + do { \ + vk_outarray_append_typed(VkPhysicalDeviceFragmentShadingRateKHR, &out, __r) { \ + __r->sampleCounts = _samples; \ + __r->fragmentSize = (VkExtent2D) { \ + .width = _width, \ + .height = _height, \ + }; \ + } \ + } while (0) + + VkSampleCountFlags sample_counts = + isl_device_get_sample_counts(&physical_device->isl_dev); + + /* BSpec 47003: There are a number of restrictions on the sample count + * based off the coarse pixel size. + */ + static const VkSampleCountFlags cp_size_sample_limits[] = { + [1] = ISL_SAMPLE_COUNT_16_BIT | ISL_SAMPLE_COUNT_8_BIT | + ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT, + [2] = ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT, + [4] = ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT, + [8] = ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT, + [16] = ISL_SAMPLE_COUNT_1_BIT, + }; + + for (uint32_t x = 4; x >= 1; x /= 2) { + for (uint32_t y = 4; y >= 1; y /= 2) { + if (physical_device->info.has_coarse_pixel_primitive_and_cb) { + /* BSpec 47003: + * "CPsize 1x4 and 4x1 are not supported" + */ + if ((x == 1 && y == 4) || (x == 4 && y == 1)) + continue; + + /* For size {1, 1}, the sample count must be ~0 + * + * 4x2 is also a specially case. + */ + if (x == 1 && y == 1) + append_rate(~0, x, y); + else if (x == 4 && y == 2) + append_rate(ISL_SAMPLE_COUNT_1_BIT, x, y); + else + append_rate(cp_size_sample_limits[x * y], x, y); + } else { + /* For size {1, 1}, the sample count must be ~0 */ + if (x == 1 && y == 1) + append_rate(~0, x, y); + else + append_rate(sample_counts, x, y); + } + } + } + +#undef append_rate + + return vk_outarray_status(&out); +} diff --git a/src/intel/vulkan_hasvk/anv_formats.c b/src/intel/vulkan_hasvk/anv_formats.c new file mode 100644 index 00000000000..029a6080926 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_formats.c @@ -0,0 +1,1745 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" +#include "drm-uapi/drm_fourcc.h" +#include "vk_enum_defines.h" +#include "vk_enum_to_str.h" +#include "vk_format.h" +#include "vk_util.h" + +/* + * gcc-4 and earlier don't allow compound literals where a constant + * is required in -std=c99/gnu99 mode, so we can't use ISL_SWIZZLE() + * here. -std=c89/gnu89 would allow it, but we depend on c99 features + * so using -std=c89/gnu89 is not an option. Starting from gcc-5 + * compound literals can also be considered constant in -std=c99/gnu99 + * mode. + */ +#define _ISL_SWIZZLE(r, g, b, a) { \ + ISL_CHANNEL_SELECT_##r, \ + ISL_CHANNEL_SELECT_##g, \ + ISL_CHANNEL_SELECT_##b, \ + ISL_CHANNEL_SELECT_##a, \ +} + +#define RGBA _ISL_SWIZZLE(RED, GREEN, BLUE, ALPHA) +#define BGRA _ISL_SWIZZLE(BLUE, GREEN, RED, ALPHA) +#define RGB1 _ISL_SWIZZLE(RED, GREEN, BLUE, ONE) + +#define swiz_fmt1(__vk_fmt, __hw_fmt, __swizzle) \ + [VK_ENUM_OFFSET(__vk_fmt)] = { \ + .planes = { \ + { .isl_format = __hw_fmt, .swizzle = __swizzle, \ + .denominator_scales = { 1, 1, }, \ + .aspect = VK_IMAGE_ASPECT_COLOR_BIT, \ + }, \ + }, \ + .vk_format = __vk_fmt, \ + .n_planes = 1, \ + } + +#define fmt1(__vk_fmt, __hw_fmt) \ + swiz_fmt1(__vk_fmt, __hw_fmt, RGBA) + +#define d_fmt(__vk_fmt, __hw_fmt) \ + [VK_ENUM_OFFSET(__vk_fmt)] = { \ + .planes = { \ + { .isl_format = __hw_fmt, .swizzle = RGBA, \ + .denominator_scales = { 1, 1, }, \ + .aspect = VK_IMAGE_ASPECT_DEPTH_BIT, \ + }, \ + }, \ + .vk_format = __vk_fmt, \ + .n_planes = 1, \ + } + +#define s_fmt(__vk_fmt, __hw_fmt) \ + [VK_ENUM_OFFSET(__vk_fmt)] = { \ + .planes = { \ + { .isl_format = __hw_fmt, .swizzle = RGBA, \ + .denominator_scales = { 1, 1, }, \ + .aspect = VK_IMAGE_ASPECT_STENCIL_BIT, \ + }, \ + }, \ + .vk_format = __vk_fmt, \ + .n_planes = 1, \ + } + +#define ds_fmt2(__vk_fmt, __fmt1, __fmt2) \ + [VK_ENUM_OFFSET(__vk_fmt)] = { \ + .planes = { \ + { .isl_format = __fmt1, .swizzle = RGBA, \ + .denominator_scales = { 1, 1, }, \ + .aspect = VK_IMAGE_ASPECT_DEPTH_BIT, \ + }, \ + { .isl_format = __fmt2, .swizzle = RGBA, \ + .denominator_scales = { 1, 1, }, \ + .aspect = VK_IMAGE_ASPECT_STENCIL_BIT, \ + }, \ + }, \ + .vk_format = __vk_fmt, \ + .n_planes = 2, \ + } + +#define fmt_unsupported(__vk_fmt) \ + [VK_ENUM_OFFSET(__vk_fmt)] = { \ + .planes = { \ + { .isl_format = ISL_FORMAT_UNSUPPORTED, }, \ + }, \ + .vk_format = VK_FORMAT_UNDEFINED, \ + } + +#define y_plane(__plane, __hw_fmt, __swizzle, __ycbcr_swizzle, dhs, dvs) \ + { .isl_format = __hw_fmt, \ + .swizzle = __swizzle, \ + .ycbcr_swizzle = __ycbcr_swizzle, \ + .denominator_scales = { dhs, dvs, }, \ + .has_chroma = false, \ + .aspect = VK_IMAGE_ASPECT_PLANE_0_BIT, /* Y plane is always plane 0 */ \ + } + +#define chroma_plane(__plane, __hw_fmt, __swizzle, __ycbcr_swizzle, dhs, dvs) \ + { .isl_format = __hw_fmt, \ + .swizzle = __swizzle, \ + .ycbcr_swizzle = __ycbcr_swizzle, \ + .denominator_scales = { dhs, dvs, }, \ + .has_chroma = true, \ + .aspect = VK_IMAGE_ASPECT_PLANE_ ## __plane ## _BIT, \ + } + +#define ycbcr_fmt(__vk_fmt, __n_planes, ...) 
\ + [VK_ENUM_OFFSET(__vk_fmt)] = { \ + .planes = { \ + __VA_ARGS__, \ + }, \ + .vk_format = __vk_fmt, \ + .n_planes = __n_planes, \ + .can_ycbcr = true, \ + } + +/* HINT: For array formats, the ISL name should match the VK name. For + * packed formats, they should have the channels in reverse order from each + * other. The reason for this is that, for packed formats, the ISL (and + * bspec) names are in LSB -> MSB order while VK formats are MSB -> LSB. + */ +static const struct anv_format main_formats[] = { + fmt_unsupported(VK_FORMAT_UNDEFINED), + fmt_unsupported(VK_FORMAT_R4G4_UNORM_PACK8), + fmt1(VK_FORMAT_R4G4B4A4_UNORM_PACK16, ISL_FORMAT_A4B4G4R4_UNORM), + swiz_fmt1(VK_FORMAT_B4G4R4A4_UNORM_PACK16, ISL_FORMAT_A4B4G4R4_UNORM, BGRA), + fmt1(VK_FORMAT_R5G6B5_UNORM_PACK16, ISL_FORMAT_B5G6R5_UNORM), + swiz_fmt1(VK_FORMAT_B5G6R5_UNORM_PACK16, ISL_FORMAT_B5G6R5_UNORM, BGRA), + fmt1(VK_FORMAT_R5G5B5A1_UNORM_PACK16, ISL_FORMAT_A1B5G5R5_UNORM), + swiz_fmt1(VK_FORMAT_B5G5R5A1_UNORM_PACK16, ISL_FORMAT_A1B5G5R5_UNORM, BGRA), + fmt1(VK_FORMAT_A1R5G5B5_UNORM_PACK16, ISL_FORMAT_B5G5R5A1_UNORM), + fmt1(VK_FORMAT_R8_UNORM, ISL_FORMAT_R8_UNORM), + fmt1(VK_FORMAT_R8_SNORM, ISL_FORMAT_R8_SNORM), + fmt1(VK_FORMAT_R8_USCALED, ISL_FORMAT_R8_USCALED), + fmt1(VK_FORMAT_R8_SSCALED, ISL_FORMAT_R8_SSCALED), + fmt1(VK_FORMAT_R8_UINT, ISL_FORMAT_R8_UINT), + fmt1(VK_FORMAT_R8_SINT, ISL_FORMAT_R8_SINT), + swiz_fmt1(VK_FORMAT_R8_SRGB, ISL_FORMAT_L8_UNORM_SRGB, + _ISL_SWIZZLE(RED, ZERO, ZERO, ONE)), + fmt1(VK_FORMAT_R8G8_UNORM, ISL_FORMAT_R8G8_UNORM), + fmt1(VK_FORMAT_R8G8_SNORM, ISL_FORMAT_R8G8_SNORM), + fmt1(VK_FORMAT_R8G8_USCALED, ISL_FORMAT_R8G8_USCALED), + fmt1(VK_FORMAT_R8G8_SSCALED, ISL_FORMAT_R8G8_SSCALED), + fmt1(VK_FORMAT_R8G8_UINT, ISL_FORMAT_R8G8_UINT), + fmt1(VK_FORMAT_R8G8_SINT, ISL_FORMAT_R8G8_SINT), + fmt_unsupported(VK_FORMAT_R8G8_SRGB), /* L8A8_UNORM_SRGB */ + fmt1(VK_FORMAT_R8G8B8_UNORM, ISL_FORMAT_R8G8B8_UNORM), + fmt1(VK_FORMAT_R8G8B8_SNORM, ISL_FORMAT_R8G8B8_SNORM), + fmt1(VK_FORMAT_R8G8B8_USCALED, ISL_FORMAT_R8G8B8_USCALED), + fmt1(VK_FORMAT_R8G8B8_SSCALED, ISL_FORMAT_R8G8B8_SSCALED), + fmt1(VK_FORMAT_R8G8B8_UINT, ISL_FORMAT_R8G8B8_UINT), + fmt1(VK_FORMAT_R8G8B8_SINT, ISL_FORMAT_R8G8B8_SINT), + fmt1(VK_FORMAT_R8G8B8_SRGB, ISL_FORMAT_R8G8B8_UNORM_SRGB), + fmt1(VK_FORMAT_R8G8B8A8_UNORM, ISL_FORMAT_R8G8B8A8_UNORM), + fmt1(VK_FORMAT_R8G8B8A8_SNORM, ISL_FORMAT_R8G8B8A8_SNORM), + fmt1(VK_FORMAT_R8G8B8A8_USCALED, ISL_FORMAT_R8G8B8A8_USCALED), + fmt1(VK_FORMAT_R8G8B8A8_SSCALED, ISL_FORMAT_R8G8B8A8_SSCALED), + fmt1(VK_FORMAT_R8G8B8A8_UINT, ISL_FORMAT_R8G8B8A8_UINT), + fmt1(VK_FORMAT_R8G8B8A8_SINT, ISL_FORMAT_R8G8B8A8_SINT), + fmt1(VK_FORMAT_R8G8B8A8_SRGB, ISL_FORMAT_R8G8B8A8_UNORM_SRGB), + fmt1(VK_FORMAT_A8B8G8R8_UNORM_PACK32, ISL_FORMAT_R8G8B8A8_UNORM), + fmt1(VK_FORMAT_A8B8G8R8_SNORM_PACK32, ISL_FORMAT_R8G8B8A8_SNORM), + fmt1(VK_FORMAT_A8B8G8R8_USCALED_PACK32, ISL_FORMAT_R8G8B8A8_USCALED), + fmt1(VK_FORMAT_A8B8G8R8_SSCALED_PACK32, ISL_FORMAT_R8G8B8A8_SSCALED), + fmt1(VK_FORMAT_A8B8G8R8_UINT_PACK32, ISL_FORMAT_R8G8B8A8_UINT), + fmt1(VK_FORMAT_A8B8G8R8_SINT_PACK32, ISL_FORMAT_R8G8B8A8_SINT), + fmt1(VK_FORMAT_A8B8G8R8_SRGB_PACK32, ISL_FORMAT_R8G8B8A8_UNORM_SRGB), + fmt1(VK_FORMAT_A2R10G10B10_UNORM_PACK32, ISL_FORMAT_B10G10R10A2_UNORM), + fmt1(VK_FORMAT_A2R10G10B10_SNORM_PACK32, ISL_FORMAT_B10G10R10A2_SNORM), + fmt1(VK_FORMAT_A2R10G10B10_USCALED_PACK32, ISL_FORMAT_B10G10R10A2_USCALED), + fmt1(VK_FORMAT_A2R10G10B10_SSCALED_PACK32, ISL_FORMAT_B10G10R10A2_SSCALED), + fmt1(VK_FORMAT_A2R10G10B10_UINT_PACK32, 
ISL_FORMAT_B10G10R10A2_UINT), + fmt1(VK_FORMAT_A2R10G10B10_SINT_PACK32, ISL_FORMAT_B10G10R10A2_SINT), + fmt1(VK_FORMAT_A2B10G10R10_UNORM_PACK32, ISL_FORMAT_R10G10B10A2_UNORM), + fmt1(VK_FORMAT_A2B10G10R10_SNORM_PACK32, ISL_FORMAT_R10G10B10A2_SNORM), + fmt1(VK_FORMAT_A2B10G10R10_USCALED_PACK32, ISL_FORMAT_R10G10B10A2_USCALED), + fmt1(VK_FORMAT_A2B10G10R10_SSCALED_PACK32, ISL_FORMAT_R10G10B10A2_SSCALED), + fmt1(VK_FORMAT_A2B10G10R10_UINT_PACK32, ISL_FORMAT_R10G10B10A2_UINT), + fmt1(VK_FORMAT_A2B10G10R10_SINT_PACK32, ISL_FORMAT_R10G10B10A2_SINT), + fmt1(VK_FORMAT_R16_UNORM, ISL_FORMAT_R16_UNORM), + fmt1(VK_FORMAT_R16_SNORM, ISL_FORMAT_R16_SNORM), + fmt1(VK_FORMAT_R16_USCALED, ISL_FORMAT_R16_USCALED), + fmt1(VK_FORMAT_R16_SSCALED, ISL_FORMAT_R16_SSCALED), + fmt1(VK_FORMAT_R16_UINT, ISL_FORMAT_R16_UINT), + fmt1(VK_FORMAT_R16_SINT, ISL_FORMAT_R16_SINT), + fmt1(VK_FORMAT_R16_SFLOAT, ISL_FORMAT_R16_FLOAT), + fmt1(VK_FORMAT_R16G16_UNORM, ISL_FORMAT_R16G16_UNORM), + fmt1(VK_FORMAT_R16G16_SNORM, ISL_FORMAT_R16G16_SNORM), + fmt1(VK_FORMAT_R16G16_USCALED, ISL_FORMAT_R16G16_USCALED), + fmt1(VK_FORMAT_R16G16_SSCALED, ISL_FORMAT_R16G16_SSCALED), + fmt1(VK_FORMAT_R16G16_UINT, ISL_FORMAT_R16G16_UINT), + fmt1(VK_FORMAT_R16G16_SINT, ISL_FORMAT_R16G16_SINT), + fmt1(VK_FORMAT_R16G16_SFLOAT, ISL_FORMAT_R16G16_FLOAT), + fmt1(VK_FORMAT_R16G16B16_UNORM, ISL_FORMAT_R16G16B16_UNORM), + fmt1(VK_FORMAT_R16G16B16_SNORM, ISL_FORMAT_R16G16B16_SNORM), + fmt1(VK_FORMAT_R16G16B16_USCALED, ISL_FORMAT_R16G16B16_USCALED), + fmt1(VK_FORMAT_R16G16B16_SSCALED, ISL_FORMAT_R16G16B16_SSCALED), + fmt1(VK_FORMAT_R16G16B16_UINT, ISL_FORMAT_R16G16B16_UINT), + fmt1(VK_FORMAT_R16G16B16_SINT, ISL_FORMAT_R16G16B16_SINT), + fmt1(VK_FORMAT_R16G16B16_SFLOAT, ISL_FORMAT_R16G16B16_FLOAT), + fmt1(VK_FORMAT_R16G16B16A16_UNORM, ISL_FORMAT_R16G16B16A16_UNORM), + fmt1(VK_FORMAT_R16G16B16A16_SNORM, ISL_FORMAT_R16G16B16A16_SNORM), + fmt1(VK_FORMAT_R16G16B16A16_USCALED, ISL_FORMAT_R16G16B16A16_USCALED), + fmt1(VK_FORMAT_R16G16B16A16_SSCALED, ISL_FORMAT_R16G16B16A16_SSCALED), + fmt1(VK_FORMAT_R16G16B16A16_UINT, ISL_FORMAT_R16G16B16A16_UINT), + fmt1(VK_FORMAT_R16G16B16A16_SINT, ISL_FORMAT_R16G16B16A16_SINT), + fmt1(VK_FORMAT_R16G16B16A16_SFLOAT, ISL_FORMAT_R16G16B16A16_FLOAT), + fmt1(VK_FORMAT_R32_UINT, ISL_FORMAT_R32_UINT), + fmt1(VK_FORMAT_R32_SINT, ISL_FORMAT_R32_SINT), + fmt1(VK_FORMAT_R32_SFLOAT, ISL_FORMAT_R32_FLOAT), + fmt1(VK_FORMAT_R32G32_UINT, ISL_FORMAT_R32G32_UINT), + fmt1(VK_FORMAT_R32G32_SINT, ISL_FORMAT_R32G32_SINT), + fmt1(VK_FORMAT_R32G32_SFLOAT, ISL_FORMAT_R32G32_FLOAT), + fmt1(VK_FORMAT_R32G32B32_UINT, ISL_FORMAT_R32G32B32_UINT), + fmt1(VK_FORMAT_R32G32B32_SINT, ISL_FORMAT_R32G32B32_SINT), + fmt1(VK_FORMAT_R32G32B32_SFLOAT, ISL_FORMAT_R32G32B32_FLOAT), + fmt1(VK_FORMAT_R32G32B32A32_UINT, ISL_FORMAT_R32G32B32A32_UINT), + fmt1(VK_FORMAT_R32G32B32A32_SINT, ISL_FORMAT_R32G32B32A32_SINT), + fmt1(VK_FORMAT_R32G32B32A32_SFLOAT, ISL_FORMAT_R32G32B32A32_FLOAT), + fmt1(VK_FORMAT_R64_UINT, ISL_FORMAT_R64_PASSTHRU), + fmt1(VK_FORMAT_R64_SINT, ISL_FORMAT_R64_PASSTHRU), + fmt1(VK_FORMAT_R64_SFLOAT, ISL_FORMAT_R64_PASSTHRU), + fmt1(VK_FORMAT_R64G64_UINT, ISL_FORMAT_R64G64_PASSTHRU), + fmt1(VK_FORMAT_R64G64_SINT, ISL_FORMAT_R64G64_PASSTHRU), + fmt1(VK_FORMAT_R64G64_SFLOAT, ISL_FORMAT_R64G64_PASSTHRU), + fmt1(VK_FORMAT_R64G64B64_UINT, ISL_FORMAT_R64G64B64_PASSTHRU), + fmt1(VK_FORMAT_R64G64B64_SINT, ISL_FORMAT_R64G64B64_PASSTHRU), + fmt1(VK_FORMAT_R64G64B64_SFLOAT, ISL_FORMAT_R64G64B64_PASSTHRU), + fmt1(VK_FORMAT_R64G64B64A64_UINT, 
ISL_FORMAT_R64G64B64A64_PASSTHRU), + fmt1(VK_FORMAT_R64G64B64A64_SINT, ISL_FORMAT_R64G64B64A64_PASSTHRU), + fmt1(VK_FORMAT_R64G64B64A64_SFLOAT, ISL_FORMAT_R64G64B64A64_PASSTHRU), + fmt1(VK_FORMAT_B10G11R11_UFLOAT_PACK32, ISL_FORMAT_R11G11B10_FLOAT), + fmt1(VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, ISL_FORMAT_R9G9B9E5_SHAREDEXP), + + d_fmt(VK_FORMAT_D16_UNORM, ISL_FORMAT_R16_UNORM), + d_fmt(VK_FORMAT_X8_D24_UNORM_PACK32, ISL_FORMAT_R24_UNORM_X8_TYPELESS), + d_fmt(VK_FORMAT_D32_SFLOAT, ISL_FORMAT_R32_FLOAT), + s_fmt(VK_FORMAT_S8_UINT, ISL_FORMAT_R8_UINT), + fmt_unsupported(VK_FORMAT_D16_UNORM_S8_UINT), + ds_fmt2(VK_FORMAT_D24_UNORM_S8_UINT, ISL_FORMAT_R24_UNORM_X8_TYPELESS, ISL_FORMAT_R8_UINT), + ds_fmt2(VK_FORMAT_D32_SFLOAT_S8_UINT, ISL_FORMAT_R32_FLOAT, ISL_FORMAT_R8_UINT), + + swiz_fmt1(VK_FORMAT_BC1_RGB_UNORM_BLOCK, ISL_FORMAT_BC1_UNORM, RGB1), + swiz_fmt1(VK_FORMAT_BC1_RGB_SRGB_BLOCK, ISL_FORMAT_BC1_UNORM_SRGB, RGB1), + fmt1(VK_FORMAT_BC1_RGBA_UNORM_BLOCK, ISL_FORMAT_BC1_UNORM), + fmt1(VK_FORMAT_BC1_RGBA_SRGB_BLOCK, ISL_FORMAT_BC1_UNORM_SRGB), + fmt1(VK_FORMAT_BC2_UNORM_BLOCK, ISL_FORMAT_BC2_UNORM), + fmt1(VK_FORMAT_BC2_SRGB_BLOCK, ISL_FORMAT_BC2_UNORM_SRGB), + fmt1(VK_FORMAT_BC3_UNORM_BLOCK, ISL_FORMAT_BC3_UNORM), + fmt1(VK_FORMAT_BC3_SRGB_BLOCK, ISL_FORMAT_BC3_UNORM_SRGB), + fmt1(VK_FORMAT_BC4_UNORM_BLOCK, ISL_FORMAT_BC4_UNORM), + fmt1(VK_FORMAT_BC4_SNORM_BLOCK, ISL_FORMAT_BC4_SNORM), + fmt1(VK_FORMAT_BC5_UNORM_BLOCK, ISL_FORMAT_BC5_UNORM), + fmt1(VK_FORMAT_BC5_SNORM_BLOCK, ISL_FORMAT_BC5_SNORM), + fmt1(VK_FORMAT_BC6H_UFLOAT_BLOCK, ISL_FORMAT_BC6H_UF16), + fmt1(VK_FORMAT_BC6H_SFLOAT_BLOCK, ISL_FORMAT_BC6H_SF16), + fmt1(VK_FORMAT_BC7_UNORM_BLOCK, ISL_FORMAT_BC7_UNORM), + fmt1(VK_FORMAT_BC7_SRGB_BLOCK, ISL_FORMAT_BC7_UNORM_SRGB), + fmt1(VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK, ISL_FORMAT_ETC2_RGB8), + fmt1(VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK, ISL_FORMAT_ETC2_SRGB8), + fmt1(VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK, ISL_FORMAT_ETC2_RGB8_PTA), + fmt1(VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK, ISL_FORMAT_ETC2_SRGB8_PTA), + fmt1(VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK, ISL_FORMAT_ETC2_EAC_RGBA8), + fmt1(VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK, ISL_FORMAT_ETC2_EAC_SRGB8_A8), + fmt1(VK_FORMAT_EAC_R11_UNORM_BLOCK, ISL_FORMAT_EAC_R11), + fmt1(VK_FORMAT_EAC_R11_SNORM_BLOCK, ISL_FORMAT_EAC_SIGNED_R11), + fmt1(VK_FORMAT_EAC_R11G11_UNORM_BLOCK, ISL_FORMAT_EAC_RG11), + fmt1(VK_FORMAT_EAC_R11G11_SNORM_BLOCK, ISL_FORMAT_EAC_SIGNED_RG11), + fmt1(VK_FORMAT_ASTC_4x4_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_4X4_U8SRGB), + fmt1(VK_FORMAT_ASTC_5x4_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_5X4_U8SRGB), + fmt1(VK_FORMAT_ASTC_5x5_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_5X5_U8SRGB), + fmt1(VK_FORMAT_ASTC_6x5_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_6X5_U8SRGB), + fmt1(VK_FORMAT_ASTC_6x6_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_6X6_U8SRGB), + fmt1(VK_FORMAT_ASTC_8x5_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_8X5_U8SRGB), + fmt1(VK_FORMAT_ASTC_8x6_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_8X6_U8SRGB), + fmt1(VK_FORMAT_ASTC_8x8_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_8X8_U8SRGB), + fmt1(VK_FORMAT_ASTC_10x5_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X5_U8SRGB), + fmt1(VK_FORMAT_ASTC_10x6_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X6_U8SRGB), + fmt1(VK_FORMAT_ASTC_10x8_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X8_U8SRGB), + fmt1(VK_FORMAT_ASTC_10x10_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X10_U8SRGB), + fmt1(VK_FORMAT_ASTC_12x10_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_12X10_U8SRGB), + fmt1(VK_FORMAT_ASTC_12x12_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_12X12_U8SRGB), + fmt1(VK_FORMAT_ASTC_4x4_UNORM_BLOCK, 
ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16), + fmt1(VK_FORMAT_ASTC_5x4_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_5X4_FLT16), + fmt1(VK_FORMAT_ASTC_5x5_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_5X5_FLT16), + fmt1(VK_FORMAT_ASTC_6x5_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_6X5_FLT16), + fmt1(VK_FORMAT_ASTC_6x6_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_6X6_FLT16), + fmt1(VK_FORMAT_ASTC_8x5_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_8X5_FLT16), + fmt1(VK_FORMAT_ASTC_8x6_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_8X6_FLT16), + fmt1(VK_FORMAT_ASTC_8x8_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_8X8_FLT16), + fmt1(VK_FORMAT_ASTC_10x5_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X5_FLT16), + fmt1(VK_FORMAT_ASTC_10x6_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X6_FLT16), + fmt1(VK_FORMAT_ASTC_10x8_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X8_FLT16), + fmt1(VK_FORMAT_ASTC_10x10_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X10_FLT16), + fmt1(VK_FORMAT_ASTC_12x10_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_12X10_FLT16), + fmt1(VK_FORMAT_ASTC_12x12_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_12X12_FLT16), + fmt_unsupported(VK_FORMAT_B8G8R8_UNORM), + fmt_unsupported(VK_FORMAT_B8G8R8_SNORM), + fmt_unsupported(VK_FORMAT_B8G8R8_USCALED), + fmt_unsupported(VK_FORMAT_B8G8R8_SSCALED), + fmt_unsupported(VK_FORMAT_B8G8R8_UINT), + fmt_unsupported(VK_FORMAT_B8G8R8_SINT), + fmt_unsupported(VK_FORMAT_B8G8R8_SRGB), + fmt1(VK_FORMAT_B8G8R8A8_UNORM, ISL_FORMAT_B8G8R8A8_UNORM), + fmt_unsupported(VK_FORMAT_B8G8R8A8_SNORM), + fmt_unsupported(VK_FORMAT_B8G8R8A8_USCALED), + fmt_unsupported(VK_FORMAT_B8G8R8A8_SSCALED), + fmt_unsupported(VK_FORMAT_B8G8R8A8_UINT), + fmt_unsupported(VK_FORMAT_B8G8R8A8_SINT), + fmt1(VK_FORMAT_B8G8R8A8_SRGB, ISL_FORMAT_B8G8R8A8_UNORM_SRGB), +}; + +static const struct anv_format _4444_formats[] = { + fmt1(VK_FORMAT_A4R4G4B4_UNORM_PACK16, ISL_FORMAT_B4G4R4A4_UNORM), + fmt_unsupported(VK_FORMAT_A4B4G4R4_UNORM_PACK16), +}; + +static const struct anv_format ycbcr_formats[] = { + ycbcr_fmt(VK_FORMAT_G8B8G8R8_422_UNORM, 1, + y_plane(0, ISL_FORMAT_YCRCB_SWAPUV, RGBA, _ISL_SWIZZLE(BLUE, GREEN, RED, ZERO), 1, 1)), + ycbcr_fmt(VK_FORMAT_B8G8R8G8_422_UNORM, 1, + y_plane(0, ISL_FORMAT_YCRCB_SWAPUVY, RGBA, _ISL_SWIZZLE(BLUE, GREEN, RED, ZERO), 1, 1)), + ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, 3, + y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 2), + chroma_plane(2, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 2)), + ycbcr_fmt(VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, 2, + y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R8G8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 2)), + ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, 3, + y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 1), + chroma_plane(2, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 1)), + ycbcr_fmt(VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, 2, + y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R8G8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 1)), + ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, 3, + y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(2, ISL_FORMAT_R8_UNORM, RGBA, 
_ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 1, 1)), + + fmt_unsupported(VK_FORMAT_R10X6_UNORM_PACK16), + fmt_unsupported(VK_FORMAT_R10X6G10X6_UNORM_2PACK16), + fmt_unsupported(VK_FORMAT_R10X6G10X6B10X6A10X6_UNORM_4PACK16), + fmt_unsupported(VK_FORMAT_G10X6B10X6G10X6R10X6_422_UNORM_4PACK16), + fmt_unsupported(VK_FORMAT_B10X6G10X6R10X6G10X6_422_UNORM_4PACK16), + fmt_unsupported(VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_R12X4_UNORM_PACK16), + fmt_unsupported(VK_FORMAT_R12X4G12X4_UNORM_2PACK16), + fmt_unsupported(VK_FORMAT_R12X4G12X4B12X4A12X4_UNORM_4PACK16), + fmt_unsupported(VK_FORMAT_G12X4B12X4G12X4R12X4_422_UNORM_4PACK16), + fmt_unsupported(VK_FORMAT_B12X4G12X4R12X4G12X4_422_UNORM_4PACK16), + fmt_unsupported(VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_420_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_422_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G12X4_B12X4R12X4_2PLANE_422_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_444_UNORM_3PACK16), + /* TODO: it is possible to enable the following 2 formats, but that + * requires further refactoring of how we handle multiplanar formats. + */ + fmt_unsupported(VK_FORMAT_G16B16G16R16_422_UNORM), + fmt_unsupported(VK_FORMAT_B16G16R16G16_422_UNORM), + + ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, 3, + y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 2), + chroma_plane(2, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 2)), + ycbcr_fmt(VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, 2, + y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 2)), + ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, 3, + y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 1), + chroma_plane(2, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 1)), + ycbcr_fmt(VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, 2, + y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 1)), + ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, 3, + y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(2, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 1, 1)), +}; + +#undef _fmt +#undef swiz_fmt1 +#undef fmt1 +#undef fmt + +static const struct { + const struct anv_format *formats; + uint32_t n_formats; +} anv_formats[] = { + [0] = { .formats = main_formats, + .n_formats = ARRAY_SIZE(main_formats), }, + [_VK_EXT_4444_formats_number] = { .formats = _4444_formats, + .n_formats = ARRAY_SIZE(_4444_formats), }, + [_VK_KHR_sampler_ycbcr_conversion_number] = { .formats = ycbcr_formats, + .n_formats = ARRAY_SIZE(ycbcr_formats), }, 
+}; + +const struct anv_format * +anv_get_format(VkFormat vk_format) +{ + uint32_t enum_offset = VK_ENUM_OFFSET(vk_format); + uint32_t ext_number = VK_ENUM_EXTENSION(vk_format); + + if (ext_number >= ARRAY_SIZE(anv_formats) || + enum_offset >= anv_formats[ext_number].n_formats) + return NULL; + + const struct anv_format *format = + &anv_formats[ext_number].formats[enum_offset]; + if (format->planes[0].isl_format == ISL_FORMAT_UNSUPPORTED) + return NULL; + + return format; +} + +/** Return true if any format plane has non-power-of-two bits-per-block. */ +static bool +anv_format_has_npot_plane(const struct anv_format *anv_format) { + for (uint32_t i = 0; i < anv_format->n_planes; ++i) { + const struct isl_format_layout *isl_layout = + isl_format_get_layout(anv_format->planes[i].isl_format); + + if (!util_is_power_of_two_or_zero(isl_layout->bpb)) + return true; + } + + return false; +} + +/** + * Exactly one bit must be set in \a aspect. + * + * If tiling is VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, then return the + * requested anv_format_plane without checking for compatibility with modifiers. + * It is the caller's responsibility to verify that the the returned + * anv_format_plane is compatible with a particular modifier. (Observe that + * this function has no parameter for the DRM format modifier, and therefore + * _cannot_ check for compatibility). + */ +struct anv_format_plane +anv_get_format_plane(const struct intel_device_info *devinfo, + VkFormat vk_format, uint32_t plane, + VkImageTiling tiling) +{ + const struct anv_format *format = anv_get_format(vk_format); + const struct anv_format_plane unsupported = { + .isl_format = ISL_FORMAT_UNSUPPORTED, + }; + + if (format == NULL) + return unsupported; + + assert(plane < format->n_planes); + struct anv_format_plane plane_format = format->planes[plane]; + if (plane_format.isl_format == ISL_FORMAT_UNSUPPORTED) + return unsupported; + + if (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) + return plane_format; + + if (vk_format_is_depth_or_stencil(vk_format)) + return plane_format; + + const struct isl_format_layout *isl_layout = + isl_format_get_layout(plane_format.isl_format); + + /* On Ivy Bridge we don't even have enough 24 and 48-bit formats that we + * can reliably do texture upload with BLORP so just don't claim support + * for any of them. + */ + if (devinfo->verx10 == 70 && + (isl_layout->bpb == 24 || isl_layout->bpb == 48)) + return unsupported; + + if (tiling == VK_IMAGE_TILING_OPTIMAL && + !util_is_power_of_two_or_zero(isl_layout->bpb)) { + /* Tiled formats *must* be power-of-two because we need up upload + * them with the render pipeline. For 3-channel formats, we fix + * this by switching them over to RGBX or RGBA formats under the + * hood. + */ + enum isl_format rgbx = isl_format_rgb_to_rgbx(plane_format.isl_format); + if (rgbx != ISL_FORMAT_UNSUPPORTED && + isl_format_supports_rendering(devinfo, rgbx)) { + plane_format.isl_format = rgbx; + } else { + plane_format.isl_format = + isl_format_rgb_to_rgba(plane_format.isl_format); + plane_format.swizzle = ISL_SWIZZLE(RED, GREEN, BLUE, ONE); + } + } + + /* The B4G4R4A4 format isn't available prior to Broadwell so we have to fall + * back to a format with a more complex swizzle. 
+ */ + if (vk_format == VK_FORMAT_B4G4R4A4_UNORM_PACK16 && devinfo->ver < 8) { + plane_format.isl_format = ISL_FORMAT_B4G4R4A4_UNORM; + plane_format.swizzle = ISL_SWIZZLE(GREEN, RED, ALPHA, BLUE); + } + + return plane_format; +} + +struct anv_format_plane +anv_get_format_aspect(const struct intel_device_info *devinfo, + VkFormat vk_format, + VkImageAspectFlagBits aspect, VkImageTiling tiling) +{ + const uint32_t plane = + anv_aspect_to_plane(vk_format_aspects(vk_format), aspect); + return anv_get_format_plane(devinfo, vk_format, plane, tiling); +} + +// Format capabilities + +VkFormatFeatureFlags2 +anv_get_image_format_features2(const struct intel_device_info *devinfo, + VkFormat vk_format, + const struct anv_format *anv_format, + VkImageTiling vk_tiling, + const struct isl_drm_modifier_info *isl_mod_info) +{ + VkFormatFeatureFlags2 flags = 0; + + if (anv_format == NULL) + return 0; + + assert((isl_mod_info != NULL) == + (vk_tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)); + + const VkImageAspectFlags aspects = vk_format_aspects(vk_format); + + if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + if (vk_tiling == VK_IMAGE_TILING_LINEAR || + vk_tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) + return 0; + + flags |= VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT | + VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | + VK_FORMAT_FEATURE_2_BLIT_DST_BIT | + VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; + + if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + + if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && devinfo->ver >= 9) + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT; + + if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT; + + return flags; + } + + assert(aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + const struct anv_format_plane plane_format = + anv_get_format_plane(devinfo, vk_format, 0, vk_tiling); + + if (plane_format.isl_format == ISL_FORMAT_UNSUPPORTED) + return 0; + + struct anv_format_plane base_plane_format = plane_format; + if (vk_tiling != VK_IMAGE_TILING_LINEAR) { + base_plane_format = anv_get_format_plane(devinfo, vk_format, 0, + VK_IMAGE_TILING_LINEAR); + } + + enum isl_format base_isl_format = base_plane_format.isl_format; + + if (isl_format_supports_sampling(devinfo, plane_format.isl_format)) { + /* ASTC textures must be in Y-tiled memory, and we reject compressed + * formats with modifiers. We do however interpret ASTC textures with + * uncompressed formats during data transfers. + */ + if (vk_tiling != VK_IMAGE_TILING_OPTIMAL && + isl_format_get_layout(plane_format.isl_format)->txc == ISL_TXC_ASTC) + return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; + + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT; + + if (devinfo->ver >= 9) + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT; + + if (isl_format_supports_filtering(devinfo, plane_format.isl_format)) + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + } + + /* We can render to swizzled formats. However, if the alpha channel is + * moved, then blending won't work correctly. The PRM tells us + * straight-up not to render to such a surface. 
+ */ + if (isl_format_supports_rendering(devinfo, plane_format.isl_format) && + plane_format.swizzle.a == ISL_CHANNEL_SELECT_ALPHA) { + flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT; + + /* While we can render to swizzled formats, they don't blend correctly + * if there are blend constants involved. The swizzle just remaps the + * output of the shader to different channels in the texture. It + * doesn't change the interpretation of the constant blend factors in + * COLOR_CALC_STATE. + */ + if (isl_format_supports_alpha_blending(devinfo, plane_format.isl_format) && + isl_swizzle_is_identity(plane_format.swizzle)) + flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT; + } + + /* Load/store is determined based on base format. This prevents RGB + * formats from showing up as load/store capable. + */ + if (isl_format_supports_typed_reads(devinfo, base_isl_format)) + flags |= VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT; + if (isl_format_supports_typed_writes(devinfo, base_isl_format)) + flags |= VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT; + + /* Keep this old behavior on VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT. + * When KHR_format_features2 is enabled, applications should only rely on + * it for the list of shader storage extended formats [1]. Before that, + * this applies to all VkFormats. + * + * [1] : https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#features-shaderStorageImageExtendedFormats + */ + if (flags & VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT) + flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT; + + if (base_isl_format == ISL_FORMAT_R32_SINT || + base_isl_format == ISL_FORMAT_R32_UINT || + base_isl_format == ISL_FORMAT_R32_FLOAT) + flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT; + + if (flags) { + flags |= VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | + VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; + + /* Blit destination requires rendering support. */ + if (isl_format_supports_rendering(devinfo, plane_format.isl_format)) + flags |= VK_FORMAT_FEATURE_2_BLIT_DST_BIT; + } + + /* XXX: We handle 3-channel formats by switching them out for RGBX or + * RGBA formats behind-the-scenes. This works fine for textures + * because the upload process will fill in the extra channel. + * We could also support it for render targets, but it will take + * substantially more work and we have enough RGBX formats to handle + * what most clients will want. + */ + if (vk_tiling == VK_IMAGE_TILING_OPTIMAL && + base_isl_format != ISL_FORMAT_UNSUPPORTED && + !util_is_power_of_two_or_zero(isl_format_layouts[base_isl_format].bpb) && + isl_format_rgb_to_rgbx(base_isl_format) == ISL_FORMAT_UNSUPPORTED) { + flags &= ~VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT; + flags &= ~VK_FORMAT_FEATURE_2_BLIT_DST_BIT; + } + + if (anv_format->can_ycbcr) { + /* The sampler doesn't have support for mid point when it handles YUV on + * its own. + */ + if (isl_format_is_yuv(anv_format->planes[0].isl_format)) { + /* TODO: We've disabled linear implicit reconstruction with the + * sampler. The failures show a slightly out of range values on the + * bottom left of the sampled image. 
+ */ + flags |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT; + } else { + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT | + VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT | + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT; + } + + /* We can support cosited chroma locations when handle planes with our + * own shader snippets. + */ + for (unsigned p = 0; p < anv_format->n_planes; p++) { + if (anv_format->planes[p].denominator_scales[0] > 1 || + anv_format->planes[p].denominator_scales[1] > 1) { + flags |= VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT; + break; + } + } + + if (anv_format->n_planes > 1) + flags |= VK_FORMAT_FEATURE_2_DISJOINT_BIT; + + const VkFormatFeatureFlags2 disallowed_ycbcr_image_features = + VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | + VK_FORMAT_FEATURE_2_BLIT_DST_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT | + VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT; + + flags &= ~disallowed_ycbcr_image_features; + } + + if (vk_tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + if (!isl_drm_modifier_get_score(devinfo, isl_mod_info->modifier)) + return 0; + + /* Try to restrict the supported formats to those in drm_fourcc.h. The + * VK_EXT_image_drm_format_modifier does not require this (after all, two + * Vulkan apps could share an image by exchanging its VkFormat instead of + * a DRM_FORMAT), but there exist no users of such non-drm_fourcc formats + * yet. And the restriction shrinks our test surface. + */ + const struct isl_format_layout *isl_layout = + isl_format_get_layout(plane_format.isl_format); + + switch (isl_layout->colorspace) { + case ISL_COLORSPACE_LINEAR: + case ISL_COLORSPACE_SRGB: + /* Each DRM_FORMAT that we support uses unorm (if the DRM format name + * has no type suffix) or sfloat (if it has suffix F). No format + * contains mixed types. (as of 2021-06-14) + */ + if (isl_layout->uniform_channel_type != ISL_UNORM && + isl_layout->uniform_channel_type != ISL_SFLOAT) + return 0; + break; + case ISL_COLORSPACE_YUV: + anv_finishme("support YUV colorspace with DRM format modifiers"); + return 0; + case ISL_COLORSPACE_NONE: + return 0; + } + + /* We could support compressed formats if we wanted to. */ + if (isl_format_is_compressed(plane_format.isl_format)) + return 0; + + /* No non-power-of-two fourcc formats exist. + * + * Even if non-power-of-two fourcc formats existed, we could support them + * only with DRM_FORMAT_MOD_LINEAR. Tiled formats must be power-of-two + * because we implement transfers with the render pipeline. + */ + if (anv_format_has_npot_plane(anv_format)) + return 0; + + if (anv_format->n_planes > 1) { + /* For simplicity, keep DISJOINT disabled for multi-planar format. */ + flags &= ~VK_FORMAT_FEATURE_2_DISJOINT_BIT; + + /* VK_ANDROID_external_memory_android_hardware_buffer in Virtio-GPU + * Venus driver layers on top of VK_EXT_image_drm_format_modifier of + * the host Vulkan driver, and both VK_FORMAT_G8_B8R8_2PLANE_420_UNORM + * and VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM and required to support + * camera/media interop in Android. + */ + if (vk_format != VK_FORMAT_G8_B8R8_2PLANE_420_UNORM && + vk_format != VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) { + anv_finishme("support more multi-planar formats with DRM modifiers"); + return 0; + } + + /* Currently there is no way to properly map memory planes to format + * planes and aux planes due to the lack of defined ABI for external + * multi-planar images. 
+ */ + if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) { + return 0; + } + } + + if (isl_mod_info->aux_usage == ISL_AUX_USAGE_CCS_E && + !isl_format_supports_ccs_e(devinfo, plane_format.isl_format)) { + return 0; + } + + if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) { + /* Rejection DISJOINT for consistency with the GL driver. In + * eglCreateImage, we require that the dma_buf for the primary surface + * and the dma_buf for its aux surface refer to the same bo. + */ + flags &= ~VK_FORMAT_FEATURE_2_DISJOINT_BIT; + + /* When the hardware accesses a storage image, it bypasses the aux + * surface. We could support storage access on images with aux + * modifiers by resolving the aux surface prior to the storage access. + */ + flags &= ~VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT; + flags &= ~VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT; + } + } + + if (devinfo->has_coarse_pixel_primitive_and_cb && + vk_format == VK_FORMAT_R8_UINT && + vk_tiling == VK_IMAGE_TILING_OPTIMAL) + flags |= VK_FORMAT_FEATURE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR; + + return flags; +} + +static VkFormatFeatureFlags2 +get_buffer_format_features2(const struct intel_device_info *devinfo, + VkFormat vk_format, + const struct anv_format *anv_format) +{ + VkFormatFeatureFlags2 flags = 0; + + if (anv_format == NULL) + return 0; + + const enum isl_format isl_format = anv_format->planes[0].isl_format; + + if (isl_format == ISL_FORMAT_UNSUPPORTED) + return 0; + + if (anv_format->n_planes > 1) + return 0; + + if (anv_format->can_ycbcr) + return 0; + + if (vk_format_is_depth_or_stencil(vk_format)) + return 0; + + if (isl_format_supports_sampling(devinfo, isl_format) && + !isl_format_is_compressed(isl_format)) + flags |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT; + + if (isl_format_supports_vertex_fetch(devinfo, isl_format)) + flags |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT; + + if (isl_is_storage_image_format(isl_format)) + flags |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT; + + if (isl_format == ISL_FORMAT_R32_SINT || isl_format == ISL_FORMAT_R32_UINT) + flags |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; + + return flags; +} + +static VkFormatFeatureFlags +features2_to_features(VkFormatFeatureFlags2 features2) +{ + return features2 & VK_ALL_FORMAT_FEATURE_FLAG_BITS; +} + +static void +get_drm_format_modifier_properties_list(const struct anv_physical_device *physical_device, + VkFormat vk_format, + VkDrmFormatModifierPropertiesListEXT *list) +{ + const struct intel_device_info *devinfo = &physical_device->info; + const struct anv_format *anv_format = anv_get_format(vk_format); + + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierPropertiesEXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + + isl_drm_modifier_info_for_each(isl_mod_info) { + VkFormatFeatureFlags2 features2 = + anv_get_image_format_features2(devinfo, vk_format, anv_format, + VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, + isl_mod_info); + VkFormatFeatureFlags features = features2_to_features(features2); + if (!features) + continue; + + uint32_t planes = anv_format->n_planes; + if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) + ++planes; + + vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, &out, out_props) { + *out_props = (VkDrmFormatModifierPropertiesEXT) { + .drmFormatModifier = isl_mod_info->modifier, + .drmFormatModifierPlaneCount = planes, + .drmFormatModifierTilingFeatures = features, + }; + }; + } +} + +static void +get_drm_format_modifier_properties_list_2(const struct anv_physical_device *physical_device, + 
VkFormat vk_format, + VkDrmFormatModifierPropertiesList2EXT *list) +{ + const struct intel_device_info *devinfo = &physical_device->info; + const struct anv_format *anv_format = anv_get_format(vk_format); + + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierProperties2EXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + + isl_drm_modifier_info_for_each(isl_mod_info) { + VkFormatFeatureFlags2 features2 = + anv_get_image_format_features2(devinfo, vk_format, anv_format, + VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, + isl_mod_info); + if (!features2) + continue; + + uint32_t planes = anv_format->n_planes; + if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) + ++planes; + + vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT, &out, out_props) { + *out_props = (VkDrmFormatModifierProperties2EXT) { + .drmFormatModifier = isl_mod_info->modifier, + .drmFormatModifierPlaneCount = planes, + .drmFormatModifierTilingFeatures = features2, + }; + }; + } +} + +void anv_GetPhysicalDeviceFormatProperties2( + VkPhysicalDevice physicalDevice, + VkFormat vk_format, + VkFormatProperties2* pFormatProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + const struct intel_device_info *devinfo = &physical_device->info; + const struct anv_format *anv_format = anv_get_format(vk_format); + + assert(pFormatProperties->sType == VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2); + + VkFormatFeatureFlags2 linear2, optimal2, buffer2; + linear2 = anv_get_image_format_features2(devinfo, vk_format, anv_format, + VK_IMAGE_TILING_LINEAR, NULL); + optimal2 = anv_get_image_format_features2(devinfo, vk_format, anv_format, + VK_IMAGE_TILING_OPTIMAL, NULL); + buffer2 = get_buffer_format_features2(devinfo, vk_format, anv_format); + + pFormatProperties->formatProperties = (VkFormatProperties) { + .linearTilingFeatures = features2_to_features(linear2), + .optimalTilingFeatures = features2_to_features(optimal2), + .bufferFeatures = features2_to_features(buffer2), + }; + + vk_foreach_struct(ext, pFormatProperties->pNext) { + /* Use unsigned since some cases are not in the VkStructureType enum. 
*/ + switch ((unsigned)ext->sType) { + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT: + get_drm_format_modifier_properties_list(physical_device, vk_format, + (void *)ext); + break; + + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT: + get_drm_format_modifier_properties_list_2(physical_device, vk_format, + (void *)ext); + break; + + case VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3: { + VkFormatProperties3 *props = (VkFormatProperties3 *)ext; + props->linearTilingFeatures = linear2; + props->optimalTilingFeatures = optimal2; + props->bufferFeatures = buffer2; + break; + } + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } +} + +static VkResult +anv_get_image_format_properties( + struct anv_physical_device *physical_device, + const VkPhysicalDeviceImageFormatInfo2 *info, + VkImageFormatProperties *pImageFormatProperties, + VkSamplerYcbcrConversionImageFormatProperties *pYcbcrImageFormatProperties) +{ + VkFormatFeatureFlags2 format_feature_flags; + VkExtent3D maxExtent; + uint32_t maxMipLevels; + uint32_t maxArraySize; + VkSampleCountFlags sampleCounts; + const struct intel_device_info *devinfo = &physical_device->info; + const struct anv_format *format = anv_get_format(info->format); + const struct isl_drm_modifier_info *isl_mod_info = NULL; + const VkImageFormatListCreateInfo *format_list_info = + vk_find_struct_const(info->pNext, IMAGE_FORMAT_LIST_CREATE_INFO); + + if (format == NULL) + goto unsupported; + + if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *vk_mod_info = + vk_find_struct_const(info->pNext, PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT); + + isl_mod_info = isl_drm_modifier_get_info(vk_mod_info->drmFormatModifier); + if (isl_mod_info == NULL) + goto unsupported; + } + + assert(format->vk_format == info->format); + format_feature_flags = anv_get_image_format_features2(devinfo, info->format, + format, info->tiling, + isl_mod_info); + + /* Remove the VkFormatFeatureFlags that are incompatible with any declared + * image view format. (Removals are more likely to occur when a DRM format + * modifier is present). + */ + if ((info->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) && format_list_info) { + for (uint32_t i = 0; i < format_list_info->viewFormatCount; ++i) { + VkFormat vk_view_format = format_list_info->pViewFormats[i]; + const struct anv_format *anv_view_format = anv_get_format(vk_view_format); + VkFormatFeatureFlags2 view_format_features = + anv_get_image_format_features2(devinfo, vk_view_format, + anv_view_format, + info->tiling, + isl_mod_info); + format_feature_flags &= view_format_features; + } + } + + if (!format_feature_flags) + goto unsupported; + + switch (info->type) { + default: + unreachable("bad VkImageType"); + case VK_IMAGE_TYPE_1D: + maxExtent.width = 16384; + maxExtent.height = 1; + maxExtent.depth = 1; + maxMipLevels = 15; /* log2(maxWidth) + 1 */ + maxArraySize = 2048; + sampleCounts = VK_SAMPLE_COUNT_1_BIT; + break; + case VK_IMAGE_TYPE_2D: + /* FINISHME: Does this really differ for cube maps? The documentation + * for RENDER_SURFACE_STATE suggests so. 
+ */ + maxExtent.width = 16384; + maxExtent.height = 16384; + maxExtent.depth = 1; + maxMipLevels = 15; /* log2(maxWidth) + 1 */ + maxArraySize = 2048; + sampleCounts = VK_SAMPLE_COUNT_1_BIT; + break; + case VK_IMAGE_TYPE_3D: + maxExtent.width = 2048; + maxExtent.height = 2048; + maxExtent.depth = 2048; + /* Prior to SKL, the mipmaps for 3D surfaces are laid out in a way + * that make it impossible to represent in the way that + * VkSubresourceLayout expects. Since we can't tell users how to make + * sense of them, don't report them as available. + */ + if (devinfo->ver < 9 && info->tiling == VK_IMAGE_TILING_LINEAR) + maxMipLevels = 1; + else + maxMipLevels = 12; /* log2(maxWidth) + 1 */ + maxArraySize = 1; + sampleCounts = VK_SAMPLE_COUNT_1_BIT; + break; + } + + /* From the Vulkan 1.2.199 spec: + * + * "VK_IMAGE_CREATE_EXTENDED_USAGE_BIT specifies that the image can be + * created with usage flags that are not supported for the format the + * image is created with but are supported for at least one format a + * VkImageView created from the image can have." + * + * If VK_IMAGE_CREATE_EXTENDED_USAGE_BIT is set, views can be created with + * different usage than the image so we can't always filter on usage. + * There is one exception to this below for storage. + */ + const VkImageUsageFlags image_usage = info->usage; + VkImageUsageFlags view_usage = image_usage; + if (info->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT) + view_usage = 0; + + if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + /* We support modifiers only for "simple" (that is, non-array + * non-mipmapped single-sample) 2D images. + */ + if (info->type != VK_IMAGE_TYPE_2D) { + vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED, + "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT " + "requires VK_IMAGE_TYPE_2D"); + goto unsupported; + } + + maxArraySize = 1; + maxMipLevels = 1; + sampleCounts = VK_SAMPLE_COUNT_1_BIT; + + if (isl_mod_info->aux_usage == ISL_AUX_USAGE_CCS_E && + !anv_formats_ccs_e_compatible(devinfo, info->flags, info->format, + info->tiling, image_usage, + format_list_info)) { + goto unsupported; + } + } + + /* Our hardware doesn't support 1D compressed textures. + * From the SKL PRM, RENDER_SURFACE_STATE::SurfaceFormat: + * * This field cannot be a compressed (BC*, DXT*, FXT*, ETC*, EAC*) format + * if the Surface Type is SURFTYPE_1D. + * * This field cannot be ASTC format if the Surface Type is SURFTYPE_1D. 
+ */ + if (info->type == VK_IMAGE_TYPE_1D && + isl_format_is_compressed(format->planes[0].isl_format)) { + goto unsupported; + } + + if (info->tiling == VK_IMAGE_TILING_OPTIMAL && + info->type == VK_IMAGE_TYPE_2D && + (format_feature_flags & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) && + !(info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) && + !(image_usage & VK_IMAGE_USAGE_STORAGE_BIT) && + isl_format_supports_multisampling(devinfo, format->planes[0].isl_format)) { + sampleCounts = isl_device_get_sample_counts(&physical_device->isl_dev); + } + + if (view_usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) { + if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_2_BLIT_SRC_BIT))) { + goto unsupported; + } + } + + if (view_usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) { + if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT | + VK_FORMAT_FEATURE_2_BLIT_DST_BIT))) { + goto unsupported; + } + } + + if (view_usage & VK_IMAGE_USAGE_SAMPLED_BIT) { + if (!(format_feature_flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT)) { + goto unsupported; + } + } + + if (image_usage & VK_IMAGE_USAGE_STORAGE_BIT) { + /* Non-power-of-two formats can never be used as storage images. We + * only check plane 0 because there are no YCbCr formats with + * non-power-of-two planes. + */ + const struct isl_format_layout *isl_layout = + isl_format_get_layout(format->planes[0].isl_format); + if (!util_is_power_of_two_or_zero(isl_layout->bpb)) + goto unsupported; + } + + if (view_usage & VK_IMAGE_USAGE_STORAGE_BIT) { + if (!(format_feature_flags & VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT)) { + goto unsupported; + } + } + + if (view_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + if (!(format_feature_flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT)) { + goto unsupported; + } + } + + if (view_usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) { + if (!(format_feature_flags & VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) { + goto unsupported; + } + } + + if (info->flags & VK_IMAGE_CREATE_DISJOINT_BIT) { + /* From the Vulkan 1.2.149 spec, VkImageCreateInfo: + * + * If format is a multi-planar format, and if imageCreateFormatFeatures + * (as defined in Image Creation Limits) does not contain + * VK_FORMAT_FEATURE_2_DISJOINT_BIT, then flags must not contain + * VK_IMAGE_CREATE_DISJOINT_BIT. + */ + if (format->n_planes > 1 && + !(format_feature_flags & VK_FORMAT_FEATURE_2_DISJOINT_BIT)) { + goto unsupported; + } + + /* From the Vulkan 1.2.149 spec, VkImageCreateInfo: + * + * If format is not a multi-planar format, and flags does not include + * VK_IMAGE_CREATE_ALIAS_BIT, flags must not contain + * VK_IMAGE_CREATE_DISJOINT_BIT. + */ + if (format->n_planes == 1 && + !(info->flags & VK_IMAGE_CREATE_ALIAS_BIT)) { + goto unsupported; + } + + if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT && + isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) { + /* Rejection DISJOINT for consistency with the GL driver. In + * eglCreateImage, we require that the dma_buf for the primary surface + * and the dma_buf for its aux surface refer to the same bo. + */ + goto unsupported; + } + } + + if (info->flags & VK_IMAGE_CREATE_ALIAS_BIT) { + /* Reject aliasing of images with non-linear DRM format modifiers because: + * + * 1. For modifiers with compression, we store aux tracking state in + * ANV_IMAGE_MEMORY_BINDING_PRIVATE, which is not aliasable because it's + * not client-bound. + * + * 2. 
For tiled modifiers without compression, we may attempt to compress + * them behind the scenes, in which case both the aux tracking state + * and the CCS data are bound to ANV_IMAGE_MEMORY_BINDING_PRIVATE. + */ + if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT && + isl_mod_info->modifier != DRM_FORMAT_MOD_LINEAR) { + goto unsupported; + } + } + + if (image_usage & VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT) { + /* Nothing to check. */ + } + + if (image_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) { + /* Ignore this flag because it was removed from the + * provisional_I_20150910 header. + */ + } + + /* From the bspec section entitled "Surface Layout and Tiling", + * pre-gfx9 has a 2 GB limitation of the size in bytes, + * gfx9 and gfx10 have a 256 GB limitation and gfx11+ + * has a 16 TB limitation. + */ + uint64_t maxResourceSize = 0; + if (devinfo->ver < 9) + maxResourceSize = (uint64_t) 1 << 31; + else if (devinfo->ver < 11) + maxResourceSize = (uint64_t) 1 << 38; + else + maxResourceSize = (uint64_t) 1 << 44; + + *pImageFormatProperties = (VkImageFormatProperties) { + .maxExtent = maxExtent, + .maxMipLevels = maxMipLevels, + .maxArrayLayers = maxArraySize, + .sampleCounts = sampleCounts, + + /* FINISHME: Accurately calculate + * VkImageFormatProperties::maxResourceSize. + */ + .maxResourceSize = maxResourceSize, + }; + + if (pYcbcrImageFormatProperties) { + pYcbcrImageFormatProperties->combinedImageSamplerDescriptorCount = + format->n_planes; + } + + return VK_SUCCESS; + +unsupported: + *pImageFormatProperties = (VkImageFormatProperties) { + .maxExtent = { 0, 0, 0 }, + .maxMipLevels = 0, + .maxArrayLayers = 0, + .sampleCounts = 0, + .maxResourceSize = 0, + }; + + return VK_ERROR_FORMAT_NOT_SUPPORTED; +} + +VkResult anv_GetPhysicalDeviceImageFormatProperties( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkImageType type, + VkImageTiling tiling, + VkImageUsageFlags usage, + VkImageCreateFlags createFlags, + VkImageFormatProperties* pImageFormatProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + + const VkPhysicalDeviceImageFormatInfo2 info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .pNext = NULL, + .format = format, + .type = type, + .tiling = tiling, + .usage = usage, + .flags = createFlags, + }; + + return anv_get_image_format_properties(physical_device, &info, + pImageFormatProperties, NULL); +} + + +/* Supports opaque fd but not dma_buf. */ +static const VkExternalMemoryProperties opaque_fd_only_props = { + .externalMemoryFeatures = + VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, + .compatibleHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, +}; + +/* Supports opaque fd and dma_buf. 
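The maxResourceSize shifts above encode the limits quoted from the bspec: 1 << 31 is 2 GiB, 1 << 38 is 256 GiB and 1 << 44 is 16 TiB; on the gfx7/8 platforms this driver targets, only the first case is reachable. A standalone sanity check of the constants (illustration only, not part of the patch):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
   const uint64_t GiB = UINT64_C(1) << 30;
   const uint64_t TiB = UINT64_C(1) << 40;

   assert((UINT64_C(1) << 31) == 2 * GiB);    /* pre-gfx9 */
   assert((UINT64_C(1) << 38) == 256 * GiB);  /* gfx9 / gfx10 */
   assert((UINT64_C(1) << 44) == 16 * TiB);   /* gfx11+ */
   return 0;
}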
*/ +static const VkExternalMemoryProperties opaque_fd_dma_buf_props = { + .externalMemoryFeatures = + VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT | + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, + .compatibleHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT | + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, +}; + +static const VkExternalMemoryProperties userptr_props = { + .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = 0, + .compatibleHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT, +}; + +static const VkExternalMemoryProperties android_buffer_props = { + .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID, + .compatibleHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID, +}; + + +static const VkExternalMemoryProperties android_image_props = { + .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID, + .compatibleHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID, +}; + +VkResult anv_GetPhysicalDeviceImageFormatProperties2( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceImageFormatInfo2* base_info, + VkImageFormatProperties2* base_props) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL; + VkExternalImageFormatProperties *external_props = NULL; + VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL; + VkAndroidHardwareBufferUsageANDROID *android_usage = NULL; + VkResult result; + + /* Extract input structs */ + vk_foreach_struct_const(s, base_info->pNext) { + switch (s->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO: + external_info = (const void *) s; + break; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT: + case VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO: + /* anv_get_image_format_properties will handle these */ + break; + case VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO: + /* Ignore but don't warn */ + break; + default: + anv_debug_ignored_stype(s->sType); + break; + } + } + + /* Extract output structs */ + vk_foreach_struct(s, base_props->pNext) { + switch (s->sType) { + case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES: + external_props = (void *) s; + break; + case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES: + ycbcr_props = (void *) s; + break; + case VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_USAGE_ANDROID: + android_usage = (void *) s; + break; + default: + anv_debug_ignored_stype(s->sType); + break; + } + } + + result = anv_get_image_format_properties(physical_device, base_info, + &base_props->imageFormatProperties, ycbcr_props); + if (result != VK_SUCCESS) + goto fail; + + bool ahw_supported = + physical_device->vk.supported_extensions.ANDROID_external_memory_android_hardware_buffer; + + if (ahw_supported && android_usage) { + android_usage->androidHardwareBufferUsage = + 
anv_ahw_usage_from_vk_usage(base_info->flags, + base_info->usage); + + /* Limit maxArrayLayers to 1 for AHardwareBuffer based images for now. */ + base_props->imageFormatProperties.maxArrayLayers = 1; + } + + /* From the Vulkan 1.0.42 spec: + * + * If handleType is 0, vkGetPhysicalDeviceImageFormatProperties2 will + * behave as if VkPhysicalDeviceExternalImageFormatInfo was not + * present and VkExternalImageFormatProperties will be ignored. + */ + if (external_info && external_info->handleType != 0) { + /* Does there exist a method for app and driver to explicitly communicate + * to each other the image's memory layout? + */ + bool tiling_has_explicit_layout; + + switch (base_info->tiling) { + default: + unreachable("bad VkImageTiling"); + case VK_IMAGE_TILING_LINEAR: + /* The app can query the image's memory layout with + * vkGetImageSubresourceLayout. + */ + tiling_has_explicit_layout = true; + break; + case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT: + /* The app can provide the image's memory layout with + * VkImageDrmFormatModifierExplicitCreateInfoEXT; + * or the app can query it with vkGetImageSubresourceLayout. + */ + tiling_has_explicit_layout = true; + break; + case VK_IMAGE_TILING_OPTIMAL: + /* The app can neither query nor provide the image's memory layout. */ + tiling_has_explicit_layout = false; + break; + } + + /* Compatibility between tiling and external memory handles + * -------------------------------------------------------- + * When importing or exporting an image, there must exist a method that + * enables the app and driver to agree on the image's memory layout. If no + * method exists, then we reject image creation here. + * + * If the memory handle requires matching + * VkPhysicalDeviceIDProperties::driverUUID and ::deviceUUID, then the + * match-requirement guarantees that all users of the image agree on the + * image's memory layout. + * + * If the memory handle does not require matching + * VkPhysicalDeviceIDProperties::driverUUID nor ::deviceUUID, then we + * require that the app and driver be able to explicitly communicate to + * each other the image's memory layout. + * + * (For restrictions on driverUUID and deviceUUID, see the Vulkan 1.2.149 + * spec, Table 73 "External memory handle types"). + */ + switch (external_info->handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + if (external_props) { + if (tiling_has_explicit_layout) { + /* With an explicit memory layout, we don't care which type of fd + * the image belongs too. Both OPAQUE_FD and DMA_BUF are + * interchangeable here. + */ + external_props->externalMemoryProperties = opaque_fd_dma_buf_props; + } else { + /* With an implicit memory layout, we must rely on deviceUUID + * and driverUUID to determine the layout. Therefore DMA_BUF is + * incompatible here. + */ + external_props->externalMemoryProperties = opaque_fd_only_props; + } + } + break; + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + /* This memory handle has no restrictions on driverUUID nor deviceUUID, + * and therefore requires explicit memory layout. + */ + if (!tiling_has_explicit_layout) { + result = vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED, + "VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT " + "requires VK_IMAGE_TILING_LINEAR or " + "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT"); + goto fail; + } + + /* With an explicit memory layout, we don't care which type of fd + * the image belongs too. Both OPAQUE_FD and DMA_BUF are + * interchangeable here. 
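For reference, the tiling/handle-type compatibility rules above are what an application runs into when it asks whether a linear image can be exported as a dma-buf. An application-side sketch (illustrative; the function name and the format/usage choices are arbitrary, not from the patch):

#include <vulkan/vulkan.h>

VkResult
query_linear_dma_buf_export(VkPhysicalDevice pdev, VkFormat format,
                            VkExternalMemoryProperties *out_props)
{
   const VkPhysicalDeviceExternalImageFormatInfo ext_info = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO,
      .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
   };
   const VkPhysicalDeviceImageFormatInfo2 info = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
      .pNext = &ext_info,
      .format = format,
      .type = VK_IMAGE_TYPE_2D,
      .tiling = VK_IMAGE_TILING_LINEAR,   /* explicit layout, so dma-buf is allowed */
      .usage = VK_IMAGE_USAGE_SAMPLED_BIT,
   };
   VkExternalImageFormatProperties ext_props = {
      .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES,
   };
   VkImageFormatProperties2 props = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2,
      .pNext = &ext_props,
   };

   VkResult result =
      vkGetPhysicalDeviceImageFormatProperties2(pdev, &info, &props);
   if (result == VK_SUCCESS)
      *out_props = ext_props.externalMemoryProperties;
   return result;
}

With VK_IMAGE_TILING_OPTIMAL instead of LINEAR, the DMA_BUF case in the switch rejects the query, since there is then no way for app and driver to agree on the memory layout.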
+ */ + if (external_props) + external_props->externalMemoryProperties = opaque_fd_dma_buf_props; + break; + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT: + /* This memory handle has no restrictions on driverUUID nor deviceUUID, + * and therefore requires explicit memory layout. + */ + if (!tiling_has_explicit_layout) { + result = vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED, + "VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT " + "requires VK_IMAGE_TILING_LINEAR or " + "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT"); + goto fail; + } + + if (external_props) + external_props->externalMemoryProperties = userptr_props; + break; + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID: + /* This memory handle is magic. The Vulkan spec says it has no + * requirements regarding deviceUUID nor driverUUID, but Android still + * requires support for VK_IMAGE_TILING_OPTIMAL. Android systems + * communicate the image's memory layout through backdoor channels. + */ + if (ahw_supported && external_props) { + external_props->externalMemoryProperties = android_image_props; + break; + } + FALLTHROUGH; /* If ahw not supported */ + default: + /* From the Vulkan 1.0.42 spec: + * + * If handleType is not compatible with the [parameters] specified + * in VkPhysicalDeviceImageFormatInfo2, then + * vkGetPhysicalDeviceImageFormatProperties2 returns + * VK_ERROR_FORMAT_NOT_SUPPORTED. + */ + result = vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED, + "unsupported VkExternalMemoryTypeFlagBits 0x%x", + external_info->handleType); + goto fail; + } + } + + return VK_SUCCESS; + + fail: + if (result == VK_ERROR_FORMAT_NOT_SUPPORTED) { + /* From the Vulkan 1.0.42 spec: + * + * If the combination of parameters to + * vkGetPhysicalDeviceImageFormatProperties2 is not supported by + * the implementation for use in vkCreateImage, then all members of + * imageFormatProperties will be filled with zero. + */ + base_props->imageFormatProperties = (VkImageFormatProperties) {}; + } + + return result; +} + +void anv_GetPhysicalDeviceSparseImageFormatProperties( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkImageType type, + uint32_t samples, + VkImageUsageFlags usage, + VkImageTiling tiling, + uint32_t* pNumProperties, + VkSparseImageFormatProperties* pProperties) +{ + /* Sparse images are not yet supported. */ + *pNumProperties = 0; +} + +void anv_GetPhysicalDeviceSparseImageFormatProperties2( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSparseImageFormatInfo2* pFormatInfo, + uint32_t* pPropertyCount, + VkSparseImageFormatProperties2* pProperties) +{ + /* Sparse images are not yet supported. */ + *pPropertyCount = 0; +} + +void anv_GetPhysicalDeviceExternalBufferProperties( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalBufferInfo* pExternalBufferInfo, + VkExternalBufferProperties* pExternalBufferProperties) +{ + /* The Vulkan 1.0.42 spec says "handleType must be a valid + * VkExternalMemoryHandleTypeFlagBits value" in + * VkPhysicalDeviceExternalBufferInfo. This differs from + * VkPhysicalDeviceExternalImageFormatInfo, which surprisingly permits + * handleType == 0. + */ + assert(pExternalBufferInfo->handleType != 0); + + /* All of the current flags are for sparse which we don't support yet. + * Even when we do support it, doing sparse on external memory sounds + * sketchy. Also, just disallowing flags is the safe option. 
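The buffer-side query handled below follows the same pattern but is simpler, since no tiling is involved. An application-side sketch of exercising the switch below (illustrative; the usage flag is arbitrary):

#include <stdbool.h>
#include <vulkan/vulkan.h>

/* Returns true if an opaque-fd export of a transfer-dst buffer is supported. */
bool
can_export_opaque_fd_buffer(VkPhysicalDevice pdev)
{
   const VkPhysicalDeviceExternalBufferInfo info = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO,
      .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT,
      .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
   };
   VkExternalBufferProperties props = {
      .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES,
   };

   vkGetPhysicalDeviceExternalBufferProperties(pdev, &info, &props);

   return props.externalMemoryProperties.externalMemoryFeatures &
          VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT;
}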
+ */ + if (pExternalBufferInfo->flags) + goto unsupported; + + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + + switch (pExternalBufferInfo->handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + pExternalBufferProperties->externalMemoryProperties = opaque_fd_dma_buf_props; + return; + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT: + pExternalBufferProperties->externalMemoryProperties = userptr_props; + return; + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID: + if (physical_device->vk.supported_extensions.ANDROID_external_memory_android_hardware_buffer) { + pExternalBufferProperties->externalMemoryProperties = android_buffer_props; + return; + } + FALLTHROUGH; /* If ahw not supported */ + default: + goto unsupported; + } + + unsupported: + /* From the Vulkan 1.1.113 spec: + * + * compatibleHandleTypes must include at least handleType. + */ + pExternalBufferProperties->externalMemoryProperties = + (VkExternalMemoryProperties) { + .compatibleHandleTypes = pExternalBufferInfo->handleType, + }; +} + +VkResult anv_CreateSamplerYcbcrConversion( + VkDevice _device, + const VkSamplerYcbcrConversionCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSamplerYcbcrConversion* pYcbcrConversion) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_ycbcr_conversion *conversion; + + /* Search for VkExternalFormatANDROID and resolve the format. */ + struct anv_format *ext_format = NULL; + const VkExternalFormatANDROID *ext_info = + vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_FORMAT_ANDROID); + + uint64_t format = ext_info ? ext_info->externalFormat : 0; + if (format) { + assert(pCreateInfo->format == VK_FORMAT_UNDEFINED); + ext_format = (struct anv_format *) (uintptr_t) format; + } + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO); + + conversion = vk_object_zalloc(&device->vk, pAllocator, sizeof(*conversion), + VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION); + if (!conversion) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + conversion->format = anv_get_format(pCreateInfo->format); + conversion->ycbcr_model = pCreateInfo->ycbcrModel; + conversion->ycbcr_range = pCreateInfo->ycbcrRange; + + /* The Vulkan 1.1.95 spec says "When creating an external format conversion, + * the value of components if ignored." + */ + if (!ext_format) { + conversion->mapping[0] = pCreateInfo->components.r; + conversion->mapping[1] = pCreateInfo->components.g; + conversion->mapping[2] = pCreateInfo->components.b; + conversion->mapping[3] = pCreateInfo->components.a; + } + + conversion->chroma_offsets[0] = pCreateInfo->xChromaOffset; + conversion->chroma_offsets[1] = pCreateInfo->yChromaOffset; + conversion->chroma_filter = pCreateInfo->chromaFilter; + + /* Setup external format. 
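The conversion object being populated here is what an application creates for multi-planar formats. A minimal application-side sketch for an NV12-style two-plane format (illustrative; assumes the device actually advertises this format with VK_FILTER_LINEAR chroma filtering):

#include <vulkan/vulkan.h>

VkResult
create_nv12_conversion(VkDevice device, VkSamplerYcbcrConversion *out)
{
   const VkSamplerYcbcrConversionCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO,
      .format = VK_FORMAT_G8_B8R8_2PLANE_420_UNORM,
      .ycbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709,
      .ycbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_NARROW,
      .components = {
         VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY,
         VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY,
      },
      .xChromaOffset = VK_CHROMA_LOCATION_COSITED_EVEN,
      .yChromaOffset = VK_CHROMA_LOCATION_COSITED_EVEN,
      .chromaFilter = VK_FILTER_LINEAR,
      .forceExplicitReconstruction = VK_FALSE,
   };

   return vkCreateSamplerYcbcrConversion(device, &info, NULL, out);
}

With a 2x2-subsampled chroma plane and cosited-even chroma offsets as above, the chroma_reconstruction flag computed just below ends up set.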
*/ + if (ext_format) + conversion->format = ext_format; + + bool has_chroma_subsampled = false; + for (uint32_t p = 0; p < conversion->format->n_planes; p++) { + if (conversion->format->planes[p].has_chroma && + (conversion->format->planes[p].denominator_scales[0] > 1 || + conversion->format->planes[p].denominator_scales[1] > 1)) + has_chroma_subsampled = true; + } + conversion->chroma_reconstruction = has_chroma_subsampled && + (conversion->chroma_offsets[0] == VK_CHROMA_LOCATION_COSITED_EVEN || + conversion->chroma_offsets[1] == VK_CHROMA_LOCATION_COSITED_EVEN); + + *pYcbcrConversion = anv_ycbcr_conversion_to_handle(conversion); + + return VK_SUCCESS; +} + +void anv_DestroySamplerYcbcrConversion( + VkDevice _device, + VkSamplerYcbcrConversion YcbcrConversion, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion, YcbcrConversion); + + if (!conversion) + return; + + vk_object_free(&device->vk, pAllocator, conversion); +} diff --git a/src/intel/vulkan_hasvk/anv_gem.c b/src/intel/vulkan_hasvk/anv_gem.c new file mode 100644 index 00000000000..d69ebe424ca --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_gem.c @@ -0,0 +1,405 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "anv_private.h" +#include "common/intel_defines.h" +#include "common/intel_gem.h" + +/** + * Wrapper around DRM_IOCTL_I915_GEM_CREATE. + * + * Return gem handle, or 0 on failure. Gem handles are never 0. + */ +uint32_t +anv_gem_create(struct anv_device *device, uint64_t size) +{ + struct drm_i915_gem_create gem_create = { + .size = size, + }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create); + if (ret != 0) { + /* FIXME: What do we do if this fails? 
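intel_ioctl() from common/intel_gem.h, used throughout this file, is essentially a restart-on-interrupt wrapper around ioctl(); a sketch of the idea, consistent with the open-coded loop in anv_gem_set_tiling() further down (the helper name here is made up):

#include <errno.h>
#include <sys/ioctl.h>

/* Retry the ioctl when the syscall is interrupted, so callers don't have to
 * handle EINTR/EAGAIN themselves.
 */
int
retry_ioctl(int fd, unsigned long request, void *arg)
{
   int ret;
   do {
      ret = ioctl(fd, request, arg);
   } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
   return ret;
}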
*/ + return 0; + } + + return gem_create.handle; +} + +void +anv_gem_close(struct anv_device *device, uint32_t gem_handle) +{ + struct drm_gem_close close = { + .handle = gem_handle, + }; + + intel_ioctl(device->fd, DRM_IOCTL_GEM_CLOSE, &close); +} + +uint32_t +anv_gem_create_regions(struct anv_device *device, uint64_t anv_bo_size, + uint32_t flags, uint32_t num_regions, + struct drm_i915_gem_memory_class_instance *regions) +{ + /* Check for invalid flags */ + assert((flags & ~I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS) == 0); + + struct drm_i915_gem_create_ext_memory_regions ext_regions = { + .base = { .name = I915_GEM_CREATE_EXT_MEMORY_REGIONS }, + .num_regions = num_regions, + .regions = (uintptr_t)regions, + }; + + struct drm_i915_gem_create_ext gem_create = { + .size = anv_bo_size, + .extensions = (uintptr_t) &ext_regions, + .flags = flags, + }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE_EXT, + &gem_create); + if (ret != 0) { + return 0; + } + + return gem_create.handle; +} + +/** + * Wrapper around DRM_IOCTL_I915_GEM_MMAP. Returns MAP_FAILED on error. + */ +static void* +anv_gem_mmap_offset(struct anv_device *device, uint32_t gem_handle, + uint64_t offset, uint64_t size, uint32_t flags) +{ + struct drm_i915_gem_mmap_offset gem_mmap = { + .handle = gem_handle, + .flags = device->info->has_local_mem ? I915_MMAP_OFFSET_FIXED : + (flags & I915_MMAP_WC) ? I915_MMAP_OFFSET_WC : I915_MMAP_OFFSET_WB, + }; + assert(offset == 0); + + /* Get the fake offset back */ + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &gem_mmap); + if (ret != 0) + return MAP_FAILED; + + /* And map it */ + void *map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + device->fd, gem_mmap.offset); + return map; +} + +static void* +anv_gem_mmap_legacy(struct anv_device *device, uint32_t gem_handle, + uint64_t offset, uint64_t size, uint32_t flags) +{ + assert(!device->info->has_local_mem); + + struct drm_i915_gem_mmap gem_mmap = { + .handle = gem_handle, + .offset = offset, + .size = size, + .flags = flags, + }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP, &gem_mmap); + if (ret != 0) + return MAP_FAILED; + + return (void *)(uintptr_t) gem_mmap.addr_ptr; +} + +/** + * Wrapper around DRM_IOCTL_I915_GEM_MMAP. Returns MAP_FAILED on error. + */ +void* +anv_gem_mmap(struct anv_device *device, uint32_t gem_handle, + uint64_t offset, uint64_t size, uint32_t flags) +{ + void *map; + if (device->physical->has_mmap_offset) + map = anv_gem_mmap_offset(device, gem_handle, offset, size, flags); + else + map = anv_gem_mmap_legacy(device, gem_handle, offset, size, flags); + + if (map != MAP_FAILED) + VG(VALGRIND_MALLOCLIKE_BLOCK(map, size, 0, 1)); + + return map; +} + +/* This is just a wrapper around munmap, but it also notifies valgrind that + * this map is no longer valid. Pair this with anv_gem_mmap(). 
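The VALGRIND_MALLOCLIKE_BLOCK / VALGRIND_FREELIKE_BLOCK pair makes memcheck treat a GEM mapping like a heap allocation, so leaks and use-after-unmap of BO maps get reported; the VG() macro used above compiles the annotations out when Valgrind support is not built in. A minimal standalone illustration of the pairing (not driver code; uses an anonymous mapping instead of a GEM BO):

#include <stddef.h>
#include <sys/mman.h>
#include <valgrind/valgrind.h>

int
main(void)
{
   size_t size = 4096;
   void *map = mmap(NULL, size, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   if (map == MAP_FAILED)
      return 1;

   /* Tell memcheck this region behaves like a malloc'd block... */
   VALGRIND_MALLOCLIKE_BLOCK(map, size, 0, 1);

   /* ...and that it goes away again before the unmap. */
   VALGRIND_FREELIKE_BLOCK(map, 0);
   munmap(map, size);
   return 0;
}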
+ */ +void +anv_gem_munmap(struct anv_device *device, void *p, uint64_t size) +{ + VG(VALGRIND_FREELIKE_BLOCK(p, 0)); + munmap(p, size); +} + +uint32_t +anv_gem_userptr(struct anv_device *device, void *mem, size_t size) +{ + struct drm_i915_gem_userptr userptr = { + .user_ptr = (__u64)((unsigned long) mem), + .user_size = size, + .flags = 0, + }; + + if (device->physical->has_userptr_probe) + userptr.flags |= I915_USERPTR_PROBE; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_USERPTR, &userptr); + if (ret == -1) + return 0; + + return userptr.handle; +} + +int +anv_gem_set_caching(struct anv_device *device, + uint32_t gem_handle, uint32_t caching) +{ + struct drm_i915_gem_caching gem_caching = { + .handle = gem_handle, + .caching = caching, + }; + + return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &gem_caching); +} + +/** + * On error, \a timeout_ns holds the remaining time. + */ +int +anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns) +{ + struct drm_i915_gem_wait wait = { + .bo_handle = gem_handle, + .timeout_ns = *timeout_ns, + .flags = 0, + }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_WAIT, &wait); + *timeout_ns = wait.timeout_ns; + + return ret; +} + +int +anv_gem_execbuffer(struct anv_device *device, + struct drm_i915_gem_execbuffer2 *execbuf) +{ + if (execbuf->flags & I915_EXEC_FENCE_OUT) + return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, execbuf); + else + return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf); +} + +/** Return -1 on error. */ +int +anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle) +{ + if (!device->info->has_tiling_uapi) + return -1; + + struct drm_i915_gem_get_tiling get_tiling = { + .handle = gem_handle, + }; + + /* FIXME: On discrete platforms we don't have DRM_IOCTL_I915_GEM_GET_TILING + * anymore, so we will need another way to get the tiling. Apparently this + * is only used in Android code, so we may need some other way to + * communicate the tiling mode. + */ + if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) { + assert(!"Failed to get BO tiling"); + return -1; + } + + return get_tiling.tiling_mode; +} + +int +anv_gem_set_tiling(struct anv_device *device, + uint32_t gem_handle, uint32_t stride, uint32_t tiling) +{ + int ret; + + /* On discrete platforms we don't have DRM_IOCTL_I915_GEM_SET_TILING. So + * nothing needs to be done. + */ + if (!device->info->has_tiling_uapi) + return 0; + + /* set_tiling overwrites the input on the error path, so we have to open + * code intel_ioctl. 
+ */ + do { + struct drm_i915_gem_set_tiling set_tiling = { + .handle = gem_handle, + .tiling_mode = tiling, + .stride = stride, + }; + + ret = ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling); + } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); + + return ret; +} + +int +anv_gem_get_param(int fd, uint32_t param) +{ + int tmp; + + drm_i915_getparam_t gp = { + .param = param, + .value = &tmp, + }; + + int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp); + if (ret == 0) + return tmp; + + return 0; +} + +bool +anv_gem_has_context_priority(int fd, int priority) +{ + return !anv_gem_set_context_param(fd, 0, I915_CONTEXT_PARAM_PRIORITY, + priority); +} + +int +anv_gem_create_context(struct anv_device *device) +{ + struct drm_i915_gem_context_create create = { 0 }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create); + if (ret == -1) + return -1; + + return create.ctx_id; +} + +int +anv_gem_destroy_context(struct anv_device *device, int context) +{ + struct drm_i915_gem_context_destroy destroy = { + .ctx_id = context, + }; + + return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &destroy); +} + +int +anv_gem_set_context_param(int fd, int context, uint32_t param, uint64_t value) +{ + struct drm_i915_gem_context_param p = { + .ctx_id = context, + .param = param, + .value = value, + }; + int err = 0; + + if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p)) + err = -errno; + return err; +} + +int +anv_gem_context_get_reset_stats(int fd, int context, + uint32_t *active, uint32_t *pending) +{ + struct drm_i915_reset_stats stats = { + .ctx_id = context, + }; + + int ret = intel_ioctl(fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats); + if (ret == 0) { + *active = stats.batch_active; + *pending = stats.batch_pending; + } + + return ret; +} + +int +anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle) +{ + struct drm_prime_handle args = { + .handle = gem_handle, + .flags = DRM_CLOEXEC | DRM_RDWR, + }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args); + if (ret == -1) + return -1; + + return args.fd; +} + +uint32_t +anv_gem_fd_to_handle(struct anv_device *device, int fd) +{ + struct drm_prime_handle args = { + .fd = fd, + }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &args); + if (ret == -1) + return 0; + + return args.handle; +} + +int +anv_gem_reg_read(int fd, uint32_t offset, uint64_t *result) +{ + struct drm_i915_reg_read args = { + .offset = offset + }; + + int ret = intel_ioctl(fd, DRM_IOCTL_I915_REG_READ, &args); + + *result = args.val; + return ret; +} + +struct drm_i915_query_engine_info * +anv_gem_get_engine_info(int fd) +{ + return intel_i915_query_alloc(fd, DRM_I915_QUERY_ENGINE_INFO, NULL); +} diff --git a/src/intel/vulkan_hasvk/anv_gem_stubs.c b/src/intel/vulkan_hasvk/anv_gem_stubs.c new file mode 100644 index 00000000000..52767d6f3c0 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_gem_stubs.c @@ -0,0 +1,187 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this 
permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include + +#include "util/anon_file.h" +#include "anv_private.h" + +uint32_t +anv_gem_create(struct anv_device *device, uint64_t size) +{ + int fd = os_create_anonymous_file(size, "fake bo"); + if (fd == -1) + return 0; + + assert(fd != 0); + + return fd; +} + +void +anv_gem_close(struct anv_device *device, uint32_t gem_handle) +{ + close(gem_handle); +} + +uint32_t +anv_gem_create_regions(struct anv_device *device, uint64_t anv_bo_size, + uint32_t flags, uint32_t num_regions, + struct drm_i915_gem_memory_class_instance *regions) +{ + return 0; +} + +void* +anv_gem_mmap(struct anv_device *device, uint32_t gem_handle, + uint64_t offset, uint64_t size, uint32_t flags) +{ + /* Ignore flags, as they're specific to I915_GEM_MMAP. */ + (void) flags; + + return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + gem_handle, offset); +} + +/* This is just a wrapper around munmap, but it also notifies valgrind that + * this map is no longer valid. Pair this with anv_gem_mmap(). + */ +void +anv_gem_munmap(struct anv_device *device, void *p, uint64_t size) +{ + munmap(p, size); +} + +uint32_t +anv_gem_userptr(struct anv_device *device, void *mem, size_t size) +{ + int fd = os_create_anonymous_file(size, "fake bo"); + if (fd == -1) + return 0; + + assert(fd != 0); + + return fd; +} + +int +anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns) +{ + return 0; +} + +int +anv_gem_execbuffer(struct anv_device *device, + struct drm_i915_gem_execbuffer2 *execbuf) +{ + return 0; +} + +int +anv_gem_set_tiling(struct anv_device *device, + uint32_t gem_handle, uint32_t stride, uint32_t tiling) +{ + return 0; +} + +int +anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle) +{ + return 0; +} + +int +anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, + uint32_t caching) +{ + return 0; +} + +int +anv_gem_get_param(int fd, uint32_t param) +{ + unreachable("Unused"); +} + +int +anv_gem_create_context(struct anv_device *device) +{ + unreachable("Unused"); +} + +int +anv_gem_destroy_context(struct anv_device *device, int context) +{ + unreachable("Unused"); +} + +int +anv_gem_set_context_param(int fd, int context, uint32_t param, uint64_t value) +{ + unreachable("Unused"); +} + +bool +anv_gem_has_context_priority(int fd, int priority) +{ + unreachable("Unused"); +} + +int +anv_gem_context_get_reset_stats(int fd, int context, + uint32_t *active, uint32_t *pending) +{ + unreachable("Unused"); +} + +int +anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle) +{ + unreachable("Unused"); +} + +uint32_t +anv_gem_fd_to_handle(struct anv_device *device, int fd) +{ + unreachable("Unused"); +} + +int +anv_i915_query(int fd, uint64_t query_id, void *buffer, + int32_t *buffer_len) +{ + unreachable("Unused"); +} + +struct drm_i915_query_engine_info * +anv_gem_get_engine_info(int fd) +{ + unreachable("Unused"); +} + +int 
+anv_gem_reg_read(int fd, uint32_t offset, uint64_t *result) +{ + unreachable("Unused"); +} diff --git a/src/intel/vulkan_hasvk/anv_genX.h b/src/intel/vulkan_hasvk/anv_genX.h new file mode 100644 index 00000000000..102514d5e7d --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_genX.h @@ -0,0 +1,180 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* + * NOTE: The header can be included multiple times, from the same file. + */ + +/* + * Gen-specific function declarations. This header must *not* be included + * directly. Instead, it is included multiple times by anv_private.h. + * + * In this header file, the usual genx() macro is available. + */ + +#ifndef ANV_PRIVATE_H +#error This file is included by means other than anv_private.h +#endif + +struct intel_sample_positions; + +typedef struct VkRenderingSelfDependencyInfoMESA VkRenderingSelfDependencyInfoMESA; + +extern const uint32_t genX(vk_to_intel_cullmode)[]; + +extern const uint32_t genX(vk_to_intel_front_face)[]; + +extern const uint32_t genX(vk_to_intel_primitive_type)[]; + +extern const uint32_t genX(vk_to_intel_compare_op)[]; + +extern const uint32_t genX(vk_to_intel_stencil_op)[]; + +extern const uint32_t genX(vk_to_intel_logic_op)[]; + +void genX(init_physical_device_state)(struct anv_physical_device *device); + +VkResult genX(init_device_state)(struct anv_device *device); + +void genX(init_cps_device_state)(struct anv_device *device); + +void genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer); + +void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer); + +void genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer); + +void genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer, + const struct isl_surf *surf); + +void genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + int vb_index, + struct anv_address vb_address, + uint32_t vb_size); +void genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type, + uint64_t vb_used); + +void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, + unsigned width, unsigned height, + unsigned scale); + +void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer); +void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer); + +enum anv_pipe_bits +genX(emit_apply_pipe_flushes)(struct 
anv_batch *batch, + struct anv_device *device, + uint32_t current_pipeline, + enum anv_pipe_bits bits); + +void genX(emit_so_memcpy_init)(struct anv_memcpy_state *state, + struct anv_device *device, + struct anv_batch *batch); + +void genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state); + +void genX(emit_so_memcpy)(struct anv_memcpy_state *state, + struct anv_address dst, struct anv_address src, + uint32_t size); + +void genX(emit_l3_config)(struct anv_batch *batch, + const struct anv_device *device, + const struct intel_l3_config *cfg); + +void genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, + const struct intel_l3_config *cfg); + +void genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer); +void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer); + +void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer); + +void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, + bool enable); + +void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + uint32_t level, + uint32_t base_layer, + uint32_t layer_count); + +void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer); + +struct anv_state genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer); + +void +genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, + const struct intel_l3_config *l3_config, + VkShaderStageFlags active_stages, + const unsigned entry_size[4], + enum intel_urb_deref_block_size *deref_block_size); + +void genX(emit_multisample)(struct anv_batch *batch, uint32_t samples, + const struct vk_sample_locations_state *sl); + +void genX(emit_sample_pattern)(struct anv_batch *batch, + const struct vk_sample_locations_state *sl); + +void genX(emit_shading_rate)(struct anv_batch *batch, + const struct anv_graphics_pipeline *pipeline, + const struct vk_fragment_shading_rate_state *fsr); + +void genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, + struct anv_address dst, struct anv_address src, + uint32_t size); + +void genX(blorp_exec)(struct blorp_batch *batch, + const struct blorp_params *params); + +void genX(cmd_emit_timestamp)(struct anv_batch *batch, + struct anv_device *device, + struct anv_address addr, + bool end_of_pipe); + +void +genX(rasterization_mode)(VkPolygonMode raster_mode, + VkLineRasterizationModeEXT line_mode, + float line_width, + uint32_t *api_mode, + bool *msaa_rasterization_enable); + +uint32_t +genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline, + VkPolygonMode raster_mode); + +VkPolygonMode +genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline, + VkPrimitiveTopology primitive_topology); + +void +genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline, + const struct vk_graphics_pipeline_state *state); + +void +genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline); + +void +genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline); diff --git a/src/intel/vulkan_hasvk/anv_image.c b/src/intel/vulkan_hasvk/anv_image.c new file mode 100644 index 00000000000..6fb8b43c6de --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_image.c @@ -0,0 +1,2973 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without 
restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "drm-uapi/drm_fourcc.h" + +#include "anv_private.h" +#include "util/debug.h" +#include "vk_util.h" +#include "util/u_math.h" + +#include "vk_format.h" + +#define ANV_OFFSET_IMPLICIT UINT64_MAX + +static const enum isl_surf_dim +vk_to_isl_surf_dim[] = { + [VK_IMAGE_TYPE_1D] = ISL_SURF_DIM_1D, + [VK_IMAGE_TYPE_2D] = ISL_SURF_DIM_2D, + [VK_IMAGE_TYPE_3D] = ISL_SURF_DIM_3D, +}; + +static uint64_t MUST_CHECK UNUSED +memory_range_end(struct anv_image_memory_range memory_range) +{ + assert(anv_is_aligned(memory_range.offset, memory_range.alignment)); + return memory_range.offset + memory_range.size; +} + +/** + * Get binding for VkImagePlaneMemoryRequirementsInfo, + * VkBindImagePlaneMemoryInfo and VkDeviceImageMemoryRequirements. + */ +static struct anv_image_binding * +image_aspect_to_binding(struct anv_image *image, VkImageAspectFlags aspect) +{ + uint32_t plane; + + assert(image->disjoint); + + if (image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + /* Spec requires special aspects for modifier images. */ + assert(aspect >= VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT && + aspect <= VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT); + + /* We don't advertise DISJOINT for modifiers with aux, and therefore we + * don't handle queries of the modifier's "aux plane" here. + */ + assert(!isl_drm_modifier_has_aux(image->vk.drm_format_mod)); + + plane = aspect - VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT; + } else { + plane = anv_image_aspect_to_plane(image, aspect); + } + + return &image->bindings[ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane]; +} + +/** + * Extend the memory binding's range by appending a new memory range with `size` + * and `alignment` at `offset`. Return the appended range. + * + * Offset is ignored if ANV_OFFSET_IMPLICIT. + * + * The given binding must not be ANV_IMAGE_MEMORY_BINDING_MAIN. The function + * converts to MAIN as needed. + */ +static VkResult MUST_CHECK +image_binding_grow(const struct anv_device *device, + struct anv_image *image, + enum anv_image_memory_binding binding, + uint64_t offset, + uint64_t size, + uint32_t alignment, + struct anv_image_memory_range *out_range) +{ + /* We overwrite 'offset' but need to remember if it was implicit. */ + const bool has_implicit_offset = (offset == ANV_OFFSET_IMPLICIT); + + assert(size > 0); + assert(util_is_power_of_two_or_zero(alignment)); + + switch (binding) { + case ANV_IMAGE_MEMORY_BINDING_MAIN: + /* The caller must not pre-translate BINDING_PLANE_i to BINDING_MAIN. 
*/ + unreachable("ANV_IMAGE_MEMORY_BINDING_MAIN"); + case ANV_IMAGE_MEMORY_BINDING_PLANE_0: + case ANV_IMAGE_MEMORY_BINDING_PLANE_1: + case ANV_IMAGE_MEMORY_BINDING_PLANE_2: + if (!image->disjoint) + binding = ANV_IMAGE_MEMORY_BINDING_MAIN; + break; + case ANV_IMAGE_MEMORY_BINDING_PRIVATE: + assert(offset == ANV_OFFSET_IMPLICIT); + break; + case ANV_IMAGE_MEMORY_BINDING_END: + unreachable("ANV_IMAGE_MEMORY_BINDING_END"); + } + + struct anv_image_memory_range *container = + &image->bindings[binding].memory_range; + + if (has_implicit_offset) { + offset = align_u64(container->offset + container->size, alignment); + } else { + /* Offset must be validated because it comes from + * VkImageDrmFormatModifierExplicitCreateInfoEXT. + */ + if (unlikely(!anv_is_aligned(offset, alignment))) { + return vk_errorf(device, + VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT, + "VkImageDrmFormatModifierExplicitCreateInfoEXT::" + "pPlaneLayouts[]::offset is misaligned"); + } + + /* We require that surfaces be added in memory-order. This simplifies the + * layout validation required by + * VkImageDrmFormatModifierExplicitCreateInfoEXT, + */ + if (unlikely(offset < container->size)) { + return vk_errorf(device, + VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT, + "VkImageDrmFormatModifierExplicitCreateInfoEXT::" + "pPlaneLayouts[]::offset is too small"); + } + } + + if (__builtin_add_overflow(offset, size, &container->size)) { + if (has_implicit_offset) { + assert(!"overflow"); + return vk_errorf(device, VK_ERROR_UNKNOWN, + "internal error: overflow in %s", __func__); + } else { + return vk_errorf(device, + VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT, + "VkImageDrmFormatModifierExplicitCreateInfoEXT::" + "pPlaneLayouts[]::offset is too large"); + } + } + + container->alignment = MAX2(container->alignment, alignment); + + *out_range = (struct anv_image_memory_range) { + .binding = binding, + .offset = offset, + .size = size, + .alignment = alignment, + }; + + return VK_SUCCESS; +} + +/** + * Adjust range 'a' to contain range 'b'. + * + * For simplicity's sake, the offset of 'a' must be 0 and remains 0. + * If 'a' and 'b' target different bindings, then no merge occurs. 
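With an implicit offset, image_binding_grow() above appends the new range at the aligned end of what the binding already holds and bumps the container size to offset + size. A reduced standalone model of that append behavior (illustrative; the struct and helper names are ad hoc, not the driver's):

#include <assert.h>
#include <stdint.h>

struct range { uint64_t offset, size; uint32_t alignment; };

static uint64_t
align_up_u64(uint64_t v, uint64_t a)
{
   return (v + a - 1) & ~(a - 1);
}

/* Append a (size, alignment) sub-range at the aligned end of 'container',
 * grow the container to cover it, and return the placed sub-range.
 */
static struct range
append_range(struct range *container, uint64_t size, uint32_t alignment)
{
   uint64_t offset = align_up_u64(container->offset + container->size, alignment);
   container->size = offset + size;
   if (alignment > container->alignment)
      container->alignment = alignment;
   return (struct range) { .offset = offset, .size = size, .alignment = alignment };
}

int
main(void)
{
   struct range binding = { 0 };
   struct range main_surf = append_range(&binding, 1000, 64);   /* lands at 0 */
   struct range aux_surf  = append_range(&binding, 128, 4096);  /* lands at 4096 */

   assert(main_surf.offset == 0);
   assert(aux_surf.offset == 4096);
   assert(binding.size == 4096 + 128);
   return 0;
}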
+ */ +static void +memory_range_merge(struct anv_image_memory_range *a, + const struct anv_image_memory_range b) +{ + if (b.size == 0) + return; + + if (a->binding != b.binding) + return; + + assert(a->offset == 0); + assert(anv_is_aligned(a->offset, a->alignment)); + assert(anv_is_aligned(b.offset, b.alignment)); + + a->alignment = MAX2(a->alignment, b.alignment); + a->size = MAX2(a->size, b.offset + b.size); +} + +static isl_surf_usage_flags_t +choose_isl_surf_usage(VkImageCreateFlags vk_create_flags, + VkImageUsageFlags vk_usage, + isl_surf_usage_flags_t isl_extra_usage, + VkImageAspectFlagBits aspect) +{ + isl_surf_usage_flags_t isl_usage = isl_extra_usage; + + if (vk_usage & VK_IMAGE_USAGE_SAMPLED_BIT) + isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT; + + if (vk_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) + isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT; + + if (vk_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) + isl_usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT; + + if (vk_usage & VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR) + isl_usage |= ISL_SURF_USAGE_CPB_BIT; + + if (vk_create_flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) + isl_usage |= ISL_SURF_USAGE_CUBE_BIT; + + /* Even if we're only using it for transfer operations, clears to depth and + * stencil images happen as depth and stencil so they need the right ISL + * usage bits or else things will fall apart. + */ + switch (aspect) { + case VK_IMAGE_ASPECT_DEPTH_BIT: + isl_usage |= ISL_SURF_USAGE_DEPTH_BIT; + break; + case VK_IMAGE_ASPECT_STENCIL_BIT: + isl_usage |= ISL_SURF_USAGE_STENCIL_BIT; + break; + case VK_IMAGE_ASPECT_COLOR_BIT: + case VK_IMAGE_ASPECT_PLANE_0_BIT: + case VK_IMAGE_ASPECT_PLANE_1_BIT: + case VK_IMAGE_ASPECT_PLANE_2_BIT: + break; + default: + unreachable("bad VkImageAspect"); + } + + if (vk_usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) { + /* blorp implements transfers by sampling from the source image. */ + isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT; + } + + if (vk_usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT && + aspect == VK_IMAGE_ASPECT_COLOR_BIT) { + /* blorp implements transfers by rendering into the destination image. + * Only request this with color images, as we deal with depth/stencil + * formats differently. */ + isl_usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT; + } + + return isl_usage; +} + +static isl_tiling_flags_t +choose_isl_tiling_flags(const struct intel_device_info *devinfo, + const struct anv_image_create_info *anv_info, + const struct isl_drm_modifier_info *isl_mod_info, + bool legacy_scanout) +{ + const VkImageCreateInfo *base_info = anv_info->vk_info; + isl_tiling_flags_t flags = 0; + + assert((isl_mod_info != NULL) == + (base_info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)); + + switch (base_info->tiling) { + default: + unreachable("bad VkImageTiling"); + case VK_IMAGE_TILING_OPTIMAL: + flags = ISL_TILING_ANY_MASK; + break; + case VK_IMAGE_TILING_LINEAR: + flags = ISL_TILING_LINEAR_BIT; + break; + case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT: + flags = 1 << isl_mod_info->tiling; + } + + if (anv_info->isl_tiling_flags) { + assert(isl_mod_info == NULL); + flags &= anv_info->isl_tiling_flags; + } + + if (legacy_scanout) { + isl_tiling_flags_t legacy_mask = ISL_TILING_LINEAR_BIT; + if (devinfo->has_tiling_uapi) + legacy_mask |= ISL_TILING_X_BIT; + flags &= legacy_mask; + } + + assert(flags); + + return flags; +} + +/** + * Add the surface to the binding at the given offset. 
+ * + * \see image_binding_grow() + */ +static VkResult MUST_CHECK +add_surface(struct anv_device *device, + struct anv_image *image, + struct anv_surface *surf, + enum anv_image_memory_binding binding, + uint64_t offset) +{ + /* isl surface must be initialized */ + assert(surf->isl.size_B > 0); + + return image_binding_grow(device, image, binding, offset, + surf->isl.size_B, + surf->isl.alignment_B, + &surf->memory_range); +} + +/** + * Do hardware limitations require the image plane to use a shadow surface? + * + * If hardware limitations force us to use a shadow surface, then the same + * limitations may also constrain the tiling of the primary surface; therefore + * parameter @a inout_primary_tiling_flags. + * + * If the image plane is a separate stencil plane and if the user provided + * VkImageStencilUsageCreateInfo, then @a usage must be stencilUsage. + * + * @see anv_image::planes[]::shadow_surface + */ +static bool +anv_image_plane_needs_shadow_surface(const struct intel_device_info *devinfo, + struct anv_format_plane plane_format, + VkImageTiling vk_tiling, + VkImageUsageFlags vk_plane_usage, + VkImageCreateFlags vk_create_flags, + isl_tiling_flags_t *inout_primary_tiling_flags) +{ + if (devinfo->ver <= 8 && + (vk_create_flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT) && + vk_tiling == VK_IMAGE_TILING_OPTIMAL) { + /* We must fallback to a linear surface because we may not be able to + * correctly handle the offsets if tiled. (On gfx9, + * RENDER_SURFACE_STATE::X/Y Offset are sufficient). To prevent garbage + * performance while texturing, we maintain a tiled shadow surface. + */ + assert(isl_format_is_compressed(plane_format.isl_format)); + + if (inout_primary_tiling_flags) { + *inout_primary_tiling_flags = ISL_TILING_LINEAR_BIT; + } + + return true; + } + + if (devinfo->ver <= 7 && + plane_format.aspect == VK_IMAGE_ASPECT_STENCIL_BIT && + (vk_plane_usage & (VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))) { + /* gfx7 can't sample from W-tiled surfaces. */ + return true; + } + + return false; +} + +static bool +can_fast_clear_with_non_zero_color(const struct intel_device_info *devinfo, + const struct anv_image *image, + uint32_t plane, + const VkImageFormatListCreateInfo *fmt_list) +{ + /* If we don't have an AUX surface where fast clears apply, we can return + * early. + */ + if (!isl_aux_usage_has_fast_clears(image->planes[plane].aux_usage)) + return false; + + /* On TGL, if a block of fragment shader outputs match the surface's clear + * color, the HW may convert them to fast-clears (see HSD 14010672564). + * This can lead to rendering corruptions if not handled properly. We + * restrict the clear color to zero to avoid issues that can occur with: + * - Texture view rendering (including blorp_copy calls) + * - Images with multiple levels or array layers + */ + if (devinfo->ver >= 12 && + image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) + return false; + + /* Non mutable image, we can fast clear with any color supported by HW. 
+ */ + if (!(image->vk.create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT)) + return true; + + /* Mutable image with no format list, we have to assume all formats */ + if (!fmt_list || fmt_list->viewFormatCount == 0) + return false; + + enum isl_format img_format = image->planes[plane].primary_surface.isl.format; + + /* Check bit compatibility for clear color components */ + for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) { + struct anv_format_plane view_format_plane = + anv_get_format_plane(devinfo, fmt_list->pViewFormats[i], + plane, image->vk.tiling); + + enum isl_format view_format = view_format_plane.isl_format; + + if (!isl_formats_have_same_bits_per_channel(img_format, view_format)) + return false; + + /* Switching between any of those format types on Gfx7/8 will cause + * problems https://gitlab.freedesktop.org/mesa/mesa/-/issues/1711 + */ + if (devinfo->ver <= 8) { + if (isl_format_has_float_channel(img_format) && + !isl_format_has_float_channel(view_format)) + return false; + + if (isl_format_has_int_channel(img_format) && + !isl_format_has_int_channel(view_format)) + return false; + + if (isl_format_has_unorm_channel(img_format) && + !isl_format_has_unorm_channel(view_format)) + return false; + + if (isl_format_has_snorm_channel(img_format) && + !isl_format_has_snorm_channel(view_format)) + return false; + } + } + + return true; +} + +/** + * Return true if the storage image could be used with atomics. + * + * If the image was created with an explicit format, we check it for typed + * atomic support. If MUTABLE_FORMAT_BIT is set, then we check the optional + * format list, seeing if /any/ of the formats support typed atomics. If no + * list is supplied, we fall back to using the bpb, as the application could + * make an image view with a format that does use atomics. + */ +static bool +storage_image_format_supports_atomic(const struct intel_device_info *devinfo, + VkImageCreateFlags create_flags, + enum isl_format format, + VkImageTiling vk_tiling, + const VkImageFormatListCreateInfo *fmt_list) +{ + if (isl_format_supports_typed_atomics(devinfo, format)) + return true; + + if (!(create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT)) + return false; + + if (fmt_list) { + for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) { + enum isl_format view_format = + anv_get_isl_format(devinfo, fmt_list->pViewFormats[i], + VK_IMAGE_ASPECT_COLOR_BIT, vk_tiling); + + if (isl_format_supports_typed_atomics(devinfo, view_format)) + return true; + } + + return false; + } + + /* No explicit format list. Any 16/32/64bpp format could be used with atomics. */ + unsigned bpb = isl_format_get_layout(format)->bpb; + return bpb == 16 || bpb == 32 || bpb == 64; +} + +static enum isl_format +anv_get_isl_format_with_usage(const struct intel_device_info *devinfo, + VkFormat vk_format, + VkImageAspectFlagBits vk_aspect, + VkImageUsageFlags vk_usage, + VkImageTiling vk_tiling) +{ + assert(util_bitcount(vk_usage) == 1); + struct anv_format_plane format = + anv_get_format_aspect(devinfo, vk_format, vk_aspect, + vk_tiling); + + if ((vk_usage == VK_IMAGE_USAGE_STORAGE_BIT) && + isl_is_storage_image_format(format.isl_format)) { + enum isl_format lowered_format = + isl_lower_storage_image_format(devinfo, format.isl_format); + + /* If we lower the format, we should ensure either they both match in + * bits per channel or that there is no swizzle, because we can't use + * the swizzle for a different bit pattern. 
+ */ + assert(isl_formats_have_same_bits_per_channel(lowered_format, + format.isl_format) || + isl_swizzle_is_identity(format.swizzle)); + + format.isl_format = lowered_format; + } + + return format.isl_format; +} + +static bool +formats_ccs_e_compatible(const struct intel_device_info *devinfo, + VkImageCreateFlags create_flags, + enum isl_format format, VkImageTiling vk_tiling, + VkImageUsageFlags vk_usage, + const VkImageFormatListCreateInfo *fmt_list) +{ + if (!isl_format_supports_ccs_e(devinfo, format)) + return false; + + /* For images created without MUTABLE_FORMAT_BIT set, we know that they will + * always be used with the original format. In particular, they will always + * be used with a format that supports color compression. + */ + if (!(create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT)) + return true; + + if (!fmt_list || fmt_list->viewFormatCount == 0) + return false; + + for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) { + enum isl_format view_format = + anv_get_isl_format_with_usage(devinfo, fmt_list->pViewFormats[i], + VK_IMAGE_ASPECT_COLOR_BIT, vk_usage, + vk_tiling); + + if (!isl_formats_are_ccs_e_compatible(devinfo, format, view_format)) + return false; + } + + return true; +} + +bool +anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo, + VkImageCreateFlags create_flags, + VkFormat vk_format, VkImageTiling vk_tiling, + VkImageUsageFlags vk_usage, + const VkImageFormatListCreateInfo *fmt_list) +{ + enum isl_format format = + anv_get_isl_format_with_usage(devinfo, vk_format, + VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_USAGE_SAMPLED_BIT, vk_tiling); + + if (!formats_ccs_e_compatible(devinfo, create_flags, format, vk_tiling, + VK_IMAGE_USAGE_SAMPLED_BIT, fmt_list)) + return false; + + if (vk_usage & VK_IMAGE_USAGE_STORAGE_BIT) { + if (devinfo->verx10 < 125) + return false; + + enum isl_format lower_format = + anv_get_isl_format_with_usage(devinfo, vk_format, + VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_USAGE_STORAGE_BIT, vk_tiling); + + if (!isl_formats_are_ccs_e_compatible(devinfo, format, lower_format)) + return false; + + if (!formats_ccs_e_compatible(devinfo, create_flags, format, vk_tiling, + VK_IMAGE_USAGE_STORAGE_BIT, fmt_list)) + return false; + + /* Disable compression when surface can be potentially used for atomic + * operation. + */ + if (storage_image_format_supports_atomic(devinfo, create_flags, format, + vk_tiling, fmt_list)) + return false; + } + + return true; +} + +/** + * For color images that have an auxiliary surface, request allocation for an + * additional buffer that mainly stores fast-clear values. Use of this buffer + * allows us to access the image's subresources while being aware of their + * fast-clear values in non-trivial cases (e.g., outside of a render pass in + * which a fast clear has occurred). + * + * In order to avoid having multiple clear colors for a single plane of an + * image (hence a single RENDER_SURFACE_STATE), we only allow fast-clears on + * the first slice (level 0, layer 0). At the time of our testing (Jan 17, + * 2018), there were no known applications which would benefit from fast- + * clearing more than just the first slice. + * + * The fast clear portion of the image is laid out in the following order: + * + * * 1 or 4 dwords (depending on hardware generation) for the clear color + * * 1 dword for the anv_fast_clear_type of the clear color + * * On gfx9+, 1 dword per level and layer of the image (3D levels count + * multiple layers) in level-major order for compression state. 
+ * + * For the purpose of discoverability, the algorithm used to manage + * compression and fast-clears is described here: + * + * * On a transition from UNDEFINED or PREINITIALIZED to a defined layout, + * all of the values in the fast clear portion of the image are initialized + * to default values. + * + * * On fast-clear, the clear value is written into surface state and also + * into the buffer and the fast clear type is set appropriately. Both + * setting the fast-clear value in the buffer and setting the fast-clear + * type happen from the GPU using MI commands. + * + * * Whenever a render or blorp operation is performed with CCS_E, we call + * genX(cmd_buffer_mark_image_written) to set the compression state to + * true (which is represented by UINT32_MAX). + * + * * On pipeline barrier transitions, the worst-case transition is computed + * from the image layouts. The command streamer inspects the fast clear + * type and compression state dwords and constructs a predicate. The + * worst-case resolve is performed with the given predicate and the fast + * clear and compression state is set accordingly. + * + * See anv_layout_to_aux_usage and anv_layout_to_fast_clear_type functions for + * details on exactly what is allowed in what layouts. + * + * On gfx7-9, we do not have a concept of indirect clear colors in hardware. + * In order to deal with this, we have to do some clear color management. + * + * * For LOAD_OP_LOAD at the top of a renderpass, we have to copy the clear + * value from the buffer into the surface state with MI commands. + * + * * For any blorp operations, we pass the address to the clear value into + * blorp and it knows to copy the clear color. + */ +static VkResult MUST_CHECK +add_aux_state_tracking_buffer(struct anv_device *device, + struct anv_image *image, + uint32_t plane) +{ + assert(image && device); + assert(image->planes[plane].aux_usage != ISL_AUX_USAGE_NONE && + image->vk.aspects & (VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV | + VK_IMAGE_ASPECT_DEPTH_BIT)); + + const unsigned clear_color_state_size = device->info->ver >= 10 ? + device->isl_dev.ss.clear_color_state_size : + device->isl_dev.ss.clear_value_size; + + /* Clear color and fast clear type */ + unsigned state_size = clear_color_state_size + 4; + + /* We only need to track compression on CCS_E surfaces. */ + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) { + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + for (uint32_t l = 0; l < image->vk.mip_levels; l++) + state_size += anv_minify(image->vk.extent.depth, l) * 4; + } else { + state_size += image->vk.mip_levels * image->vk.array_layers * 4; + } + } + + enum anv_image_memory_binding binding = + ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane; + + /* If an auxiliary surface is used for an externally-shareable image, + * we have to hide this from the memory of the image since other + * processes with access to the memory may not be aware of it or of + * its current state. So put that auxiliary data into a separate + * buffer (ANV_IMAGE_MEMORY_BINDING_PRIVATE). + */ + if (anv_image_is_externally_shared(image)) { + binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE; + } + + /* We believe that 256B alignment may be sufficient, but we choose 4K due to + * lack of testing. And MI_LOAD/STORE operations require dword-alignment. 
+ */ + return image_binding_grow(device, image, binding, + ANV_OFFSET_IMPLICIT, state_size, 4096, + &image->planes[plane].fast_clear_memory_range); +} + +/** + * The return code indicates whether creation of the VkImage should continue + * or fail, not whether the creation of the aux surface succeeded. If the aux + * surface is not required (for example, by neither hardware nor DRM format + * modifier), then this may return VK_SUCCESS when creation of the aux surface + * fails. + * + * @param offset See add_surface() + */ +static VkResult +add_aux_surface_if_supported(struct anv_device *device, + struct anv_image *image, + uint32_t plane, + struct anv_format_plane plane_format, + const VkImageFormatListCreateInfo *fmt_list, + uint64_t offset, + uint32_t stride, + isl_surf_usage_flags_t isl_extra_usage_flags) +{ + VkImageAspectFlags aspect = plane_format.aspect; + VkResult result; + bool ok; + + /* The aux surface must not be already added. */ + assert(!anv_surface_is_valid(&image->planes[plane].aux_surface)); + + if ((isl_extra_usage_flags & ISL_SURF_USAGE_DISABLE_AUX_BIT)) + return VK_SUCCESS; + + if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) { + /* We don't advertise that depth buffers could be used as storage + * images. + */ + assert(!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)); + + /* Allow the user to control HiZ enabling. Disable by default on gfx7 + * because resolves are not currently implemented pre-BDW. + */ + if (!(image->vk.usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) { + /* It will never be used as an attachment, HiZ is pointless. */ + return VK_SUCCESS; + } + + if (device->info->ver == 7) { + anv_perf_warn(VK_LOG_OBJS(&image->vk.base), "Implement gfx7 HiZ"); + return VK_SUCCESS; + } + + if (image->vk.mip_levels > 1) { + anv_perf_warn(VK_LOG_OBJS(&image->vk.base), "Enable multi-LOD HiZ"); + return VK_SUCCESS; + } + + if (device->info->ver == 8 && image->vk.samples > 1) { + anv_perf_warn(VK_LOG_OBJS(&image->vk.base), + "Enable gfx8 multisampled HiZ"); + return VK_SUCCESS; + } + + if (INTEL_DEBUG(DEBUG_NO_HIZ)) + return VK_SUCCESS; + + ok = isl_surf_get_hiz_surf(&device->isl_dev, + &image->planes[plane].primary_surface.isl, + &image->planes[plane].aux_surface.isl); + if (!ok) + return VK_SUCCESS; + + if (!isl_surf_supports_ccs(&device->isl_dev, + &image->planes[plane].primary_surface.isl, + &image->planes[plane].aux_surface.isl)) { + image->planes[plane].aux_usage = ISL_AUX_USAGE_HIZ; + } else if (image->vk.usage & (VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) && + image->vk.samples == 1) { + /* If it's used as an input attachment or a texture and it's + * single-sampled (this is a requirement for HiZ+CCS write-through + * mode), use write-through mode so that we don't need to resolve + * before texturing. This will make depth testing a bit slower but + * texturing faster. + * + * TODO: This is a heuristic trade-off; we haven't tuned it at all. 
+ */ + assert(device->info->ver >= 12); + image->planes[plane].aux_usage = ISL_AUX_USAGE_HIZ_CCS_WT; + } else { + assert(device->info->ver >= 12); + image->planes[plane].aux_usage = ISL_AUX_USAGE_HIZ_CCS; + } + + result = add_surface(device, image, &image->planes[plane].aux_surface, + ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane, + ANV_OFFSET_IMPLICIT); + if (result != VK_SUCCESS) + return result; + + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT) + return add_aux_state_tracking_buffer(device, image, plane); + } else if (aspect == VK_IMAGE_ASPECT_STENCIL_BIT) { + + if (INTEL_DEBUG(DEBUG_NO_CCS)) + return VK_SUCCESS; + + if (!isl_surf_supports_ccs(&device->isl_dev, + &image->planes[plane].primary_surface.isl, + NULL)) + return VK_SUCCESS; + + image->planes[plane].aux_usage = ISL_AUX_USAGE_STC_CCS; + } else if ((aspect & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && image->vk.samples == 1) { + if (image->n_planes != 1) { + /* Multiplanar images seem to hit a sampler bug with CCS and R16G16 + * format. (Putting the clear state a page/4096bytes further fixes + * the issue). + */ + return VK_SUCCESS; + } + + if ((image->vk.create_flags & VK_IMAGE_CREATE_ALIAS_BIT)) { + /* The image may alias a plane of a multiplanar image. Above we ban + * CCS on multiplanar images. + * + * We must also reject aliasing of any image that uses + * ANV_IMAGE_MEMORY_BINDING_PRIVATE. Since we're already rejecting all + * aliasing here, there's no need to further analyze if the image needs + * a private binding. + */ + return VK_SUCCESS; + } + + if (INTEL_DEBUG(DEBUG_NO_CCS)) + return VK_SUCCESS; + + ok = isl_surf_get_ccs_surf(&device->isl_dev, + &image->planes[plane].primary_surface.isl, + NULL, + &image->planes[plane].aux_surface.isl, + stride); + if (!ok) + return VK_SUCCESS; + + /* Choose aux usage */ + if (anv_formats_ccs_e_compatible(device->info, image->vk.create_flags, + image->vk.format, image->vk.tiling, + image->vk.usage, fmt_list)) { + image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_E; + } else if (device->info->ver >= 12) { + anv_perf_warn(VK_LOG_OBJS(&image->vk.base), + "The CCS_D aux mode is not yet handled on " + "Gfx12+. 
Not allocating a CCS buffer."); + image->planes[plane].aux_surface.isl.size_B = 0; + return VK_SUCCESS; + } else { + image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_D; + } + + if (!device->physical->has_implicit_ccs) { + enum anv_image_memory_binding binding = + ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane; + + if (image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID && + !isl_drm_modifier_has_aux(image->vk.drm_format_mod)) + binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE; + + result = add_surface(device, image, &image->planes[plane].aux_surface, + binding, offset); + if (result != VK_SUCCESS) + return result; + } + + return add_aux_state_tracking_buffer(device, image, plane); + } else if ((aspect & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && image->vk.samples > 1) { + assert(!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)); + ok = isl_surf_get_mcs_surf(&device->isl_dev, + &image->planes[plane].primary_surface.isl, + &image->planes[plane].aux_surface.isl); + if (!ok) + return VK_SUCCESS; + + image->planes[plane].aux_usage = ISL_AUX_USAGE_MCS; + + result = add_surface(device, image, &image->planes[plane].aux_surface, + ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane, + ANV_OFFSET_IMPLICIT); + if (result != VK_SUCCESS) + return result; + + return add_aux_state_tracking_buffer(device, image, plane); + } + + return VK_SUCCESS; +} + +static VkResult +add_shadow_surface(struct anv_device *device, + struct anv_image *image, + uint32_t plane, + struct anv_format_plane plane_format, + uint32_t stride, + VkImageUsageFlags vk_plane_usage) +{ + ASSERTED bool ok; + + ok = isl_surf_init(&device->isl_dev, + &image->planes[plane].shadow_surface.isl, + .dim = vk_to_isl_surf_dim[image->vk.image_type], + .format = plane_format.isl_format, + .width = image->vk.extent.width, + .height = image->vk.extent.height, + .depth = image->vk.extent.depth, + .levels = image->vk.mip_levels, + .array_len = image->vk.array_layers, + .samples = image->vk.samples, + .min_alignment_B = 0, + .row_pitch_B = stride, + .usage = ISL_SURF_USAGE_TEXTURE_BIT | + (vk_plane_usage & ISL_SURF_USAGE_CUBE_BIT), + .tiling_flags = ISL_TILING_ANY_MASK); + + /* isl_surf_init() will fail only if provided invalid input. Invalid input + * here is illegal in Vulkan. + */ + assert(ok); + + return add_surface(device, image, &image->planes[plane].shadow_surface, + ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane, + ANV_OFFSET_IMPLICIT); +} + +/** + * Initialize the anv_image::*_surface selected by \a aspect. Then update the + * image's memory requirements (that is, the image's size and alignment). 
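+ *
+ * For multi-planar formats, the plane's extent is divided by the plane
+ * format's denominator_scales below. A minimal illustration, assuming a
+ * two-plane 4:2:0 format such as VK_FORMAT_G8_B8R8_2PLANE_420_UNORM at
+ * 1920x1080: plane 0 keeps scales {1,1} and stays 1920x1080, while the
+ * chroma plane has scales {2,2}, so its surface is created as
+ *
+ *    .width  = 1920 / 2 = 960
+ *    .height = 1080 / 2 = 540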
+ *
+ * @param offset See add_surface()
+ */
+static VkResult
+add_primary_surface(struct anv_device *device,
+                    struct anv_image *image,
+                    uint32_t plane,
+                    struct anv_format_plane plane_format,
+                    uint64_t offset,
+                    uint32_t stride,
+                    isl_tiling_flags_t isl_tiling_flags,
+                    isl_surf_usage_flags_t isl_usage)
+{
+   struct anv_surface *anv_surf = &image->planes[plane].primary_surface;
+   bool ok;
+
+   ok = isl_surf_init(&device->isl_dev, &anv_surf->isl,
+      .dim = vk_to_isl_surf_dim[image->vk.image_type],
+      .format = plane_format.isl_format,
+      .width = image->vk.extent.width / plane_format.denominator_scales[0],
+      .height = image->vk.extent.height / plane_format.denominator_scales[1],
+      .depth = image->vk.extent.depth,
+      .levels = image->vk.mip_levels,
+      .array_len = image->vk.array_layers,
+      .samples = image->vk.samples,
+      .min_alignment_B = 0,
+      .row_pitch_B = stride,
+      .usage = isl_usage,
+      .tiling_flags = isl_tiling_flags);
+
+   if (!ok) {
+      /* TODO: Should return
+       * VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT in some cases.
+       */
+      return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+   }
+
+   image->planes[plane].aux_usage = ISL_AUX_USAGE_NONE;
+
+   return add_surface(device, image, anv_surf,
+                      ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane, offset);
+}
+
+#ifndef NDEBUG
+static bool MUST_CHECK
+memory_range_is_aligned(struct anv_image_memory_range memory_range)
+{
+   return anv_is_aligned(memory_range.offset, memory_range.alignment);
+}
+
+static bool MUST_CHECK
+memory_ranges_equal(struct anv_image_memory_range a,
+                    struct anv_image_memory_range b)
+{
+   return a.binding == b.binding &&
+          a.offset == b.offset &&
+          a.size == b.size &&
+          a.alignment == b.alignment;
+}
+#endif
+
+struct check_memory_range_params {
+   struct anv_image_memory_range *accum_ranges;
+   const struct anv_surface *test_surface;
+   const struct anv_image_memory_range *test_range;
+   enum anv_image_memory_binding expect_binding;
+};
+
+#define check_memory_range(...) \
+   check_memory_range_s(&(struct check_memory_range_params) { __VA_ARGS__ })
+
+static void UNUSED
+check_memory_range_s(const struct check_memory_range_params *p)
+{
+   assert((p->test_surface == NULL) != (p->test_range == NULL));
+
+   const struct anv_image_memory_range *test_range =
+      p->test_range ?: &p->test_surface->memory_range;
+
+   struct anv_image_memory_range *accum_range =
+      &p->accum_ranges[p->expect_binding];
+
+   assert(test_range->binding == p->expect_binding);
+   assert(test_range->offset >= memory_range_end(*accum_range));
+   assert(memory_range_is_aligned(*test_range));
+
+   if (p->test_surface) {
+      assert(anv_surface_is_valid(p->test_surface));
+      assert(p->test_surface->memory_range.alignment ==
+             p->test_surface->isl.alignment_B);
+   }
+
+   memory_range_merge(accum_range, *test_range);
+}
+
+/**
+ * Validate the image's memory bindings *after* all its surfaces and memory
+ * ranges are final.
+ *
+ * For simplicity's sake, we do not validate free-form layout of the image's
+ * memory bindings. We validate the layout described in the comments of struct
+ * anv_image.
+ */
+static void
+check_memory_bindings(const struct anv_device *device,
+                      const struct anv_image *image)
+{
+#ifdef DEBUG
+   /* As we inspect each part of the image, we merge the part's memory range
+    * into these accumulation ranges.
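+    *
+    * For illustration, a call such as
+    *
+    *    check_memory_range(accum_ranges,
+    *                       .test_surface = &plane->primary_surface,
+    *                       .expect_binding = primary_binding);
+    *
+    * expands through the compound-literal macro above into
+    *
+    *    check_memory_range_s(&(struct check_memory_range_params) {
+    *       accum_ranges,
+    *       .test_surface = &plane->primary_surface,
+    *       .expect_binding = primary_binding
+    *    });
+    *
+    * which asserts that the tested range begins at or after everything
+    * already merged into that binding's accumulation range, and then merges
+    * it in.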
+ */ + struct anv_image_memory_range accum_ranges[ANV_IMAGE_MEMORY_BINDING_END]; + for (int i = 0; i < ANV_IMAGE_MEMORY_BINDING_END; ++i) { + accum_ranges[i] = (struct anv_image_memory_range) { + .binding = i, + }; + } + + for (uint32_t p = 0; p < image->n_planes; ++p) { + const struct anv_image_plane *plane = &image->planes[p]; + + /* The binding that must contain the plane's primary surface. */ + const enum anv_image_memory_binding primary_binding = image->disjoint + ? ANV_IMAGE_MEMORY_BINDING_PLANE_0 + p + : ANV_IMAGE_MEMORY_BINDING_MAIN; + + /* Aliasing is incompatible with the private binding because it does not + * live in a VkDeviceMemory. The one exception is swapchain images. + */ + assert(!(image->vk.create_flags & VK_IMAGE_CREATE_ALIAS_BIT) || + image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].memory_range.size == 0); + + /* Check primary surface */ + check_memory_range(accum_ranges, + .test_surface = &plane->primary_surface, + .expect_binding = primary_binding); + + /* Check shadow surface */ + if (anv_surface_is_valid(&plane->shadow_surface)) { + check_memory_range(accum_ranges, + .test_surface = &plane->shadow_surface, + .expect_binding = primary_binding); + } + + /* Check aux_surface */ + if (anv_surface_is_valid(&plane->aux_surface)) { + enum anv_image_memory_binding binding = primary_binding; + + /* If an auxiliary surface is used for an externally-shareable image, + * we have to hide this from the memory of the image since other + * processes with access to the memory may not be aware of it or of + * its current state. So put that auxiliary data into a separate + * buffer (ANV_IMAGE_MEMORY_BINDING_PRIVATE). + */ + if (anv_image_is_externally_shared(image) && + !isl_drm_modifier_has_aux(image->vk.drm_format_mod)) { + binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE; + } + + /* Display hardware requires that the aux surface start at + * a higher address than the primary surface. The 3D hardware + * doesn't care, but we enforce the display requirement in case + * the image is sent to display. + */ + check_memory_range(accum_ranges, + .test_surface = &plane->aux_surface, + .expect_binding = binding); + } + + /* Check fast clear state */ + if (plane->fast_clear_memory_range.size > 0) { + enum anv_image_memory_binding binding = primary_binding; + + /* If an auxiliary surface is used for an externally-shareable image, + * we have to hide this from the memory of the image since other + * processes with access to the memory may not be aware of it or of + * its current state. So put that auxiliary data into a separate + * buffer (ANV_IMAGE_MEMORY_BINDING_PRIVATE). + */ + if (anv_image_is_externally_shared(image)) { + binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE; + } + + /* We believe that 256B alignment may be sufficient, but we choose 4K + * due to lack of testing. And MI_LOAD/STORE operations require + * dword-alignment. + */ + assert(plane->fast_clear_memory_range.alignment == 4096); + check_memory_range(accum_ranges, + .test_range = &plane->fast_clear_memory_range, + .expect_binding = binding); + } + } +#endif +} + +/** + * Check that the fully-initialized anv_image is compatible with its DRM format + * modifier. + * + * Checking compatibility at the end of image creation is prudent, not + * superfluous, because usage of modifiers triggers numerous special cases + * throughout queries and image creation, and because + * vkGetPhysicalDeviceImageFormatProperties2 has difficulty detecting all + * incompatibilities. 
+ * + * Return VK_ERROR_UNKNOWN if the incompatibility is difficult to detect in + * vkGetPhysicalDeviceImageFormatProperties2. Otherwise, assert fail. + * + * Ideally, if vkGetPhysicalDeviceImageFormatProperties2() succeeds with a given + * modifier, then vkCreateImage() produces an image that is compatible with the + * modifier. However, it is difficult to reconcile the two functions to agree + * due to their complexity. For example, isl_surf_get_ccs_surf() may + * unexpectedly fail in vkCreateImage(), eliminating the image's aux surface + * even when the modifier requires one. (Maybe we should reconcile the two + * functions despite the difficulty). + */ +static VkResult MUST_CHECK +check_drm_format_mod(const struct anv_device *device, + const struct anv_image *image) +{ + /* Image must have a modifier if and only if it has modifier tiling. */ + assert((image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID) == + (image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)); + + if (image->vk.drm_format_mod == DRM_FORMAT_MOD_INVALID) + return VK_SUCCESS; + + const struct isl_drm_modifier_info *isl_mod_info = + isl_drm_modifier_get_info(image->vk.drm_format_mod); + + /* Driver must support the modifier. */ + assert(isl_drm_modifier_get_score(device->info, isl_mod_info->modifier)); + + /* Enforced by us, not the Vulkan spec. */ + assert(image->vk.image_type == VK_IMAGE_TYPE_2D); + assert(!(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)); + assert(!(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT)); + assert(image->vk.mip_levels == 1); + assert(image->vk.array_layers == 1); + assert(image->vk.samples == 1); + + for (int i = 0; i < image->n_planes; ++i) { + const struct anv_image_plane *plane = &image->planes[i]; + ASSERTED const struct isl_format_layout *isl_layout = + isl_format_get_layout(plane->primary_surface.isl.format); + + /* Enforced by us, not the Vulkan spec. */ + assert(isl_layout->txc == ISL_TXC_NONE); + assert(isl_layout->colorspace == ISL_COLORSPACE_LINEAR || + isl_layout->colorspace == ISL_COLORSPACE_SRGB); + assert(!anv_surface_is_valid(&plane->shadow_surface)); + + if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) { + /* Reject DISJOINT for consistency with the GL driver. */ + assert(!image->disjoint); + + /* The modifier's required aux usage mandates the image's aux usage. + * The inverse, however, does not hold; if the modifier has no aux + * usage, then we may enable a private aux surface. + */ + if (plane->aux_usage != isl_mod_info->aux_usage) { + return vk_errorf(device, VK_ERROR_UNKNOWN, + "image with modifier unexpectedly has wrong aux " + "usage"); + } + } + } + + return VK_SUCCESS; +} + +/** + * Use when the app does not provide + * VkImageDrmFormatModifierExplicitCreateInfoEXT. 
+ */
+static VkResult MUST_CHECK
+add_all_surfaces_implicit_layout(
+   struct anv_device *device,
+   struct anv_image *image,
+   const VkImageFormatListCreateInfo *format_list_info,
+   uint32_t stride,
+   isl_tiling_flags_t isl_tiling_flags,
+   isl_surf_usage_flags_t isl_extra_usage_flags)
+{
+   const struct intel_device_info *devinfo = device->info;
+   VkResult result;
+
+   u_foreach_bit(b, image->vk.aspects) {
+      VkImageAspectFlagBits aspect = 1 << b;
+      const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+      const struct anv_format_plane plane_format =
+         anv_get_format_plane(devinfo, image->vk.format, plane, image->vk.tiling);
+
+      VkImageUsageFlags vk_usage = vk_image_usage(&image->vk, aspect);
+      isl_surf_usage_flags_t isl_usage =
+         choose_isl_surf_usage(image->vk.create_flags, vk_usage,
+                               isl_extra_usage_flags, aspect);
+
+      /* Must call this before adding any surfaces because it may modify
+       * isl_tiling_flags.
+       */
+      bool needs_shadow =
+         anv_image_plane_needs_shadow_surface(devinfo, plane_format,
+                                              image->vk.tiling, vk_usage,
+                                              image->vk.create_flags,
+                                              &isl_tiling_flags);
+
+      result = add_primary_surface(device, image, plane, plane_format,
+                                   ANV_OFFSET_IMPLICIT, stride,
+                                   isl_tiling_flags, isl_usage);
+      if (result != VK_SUCCESS)
+         return result;
+
+      if (needs_shadow) {
+         result = add_shadow_surface(device, image, plane, plane_format,
+                                     stride, vk_usage);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+
+      /* Disable aux if image supports export without modifiers. */
+      if (image->vk.external_handle_types != 0 &&
+          image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)
+         continue;
+
+      result = add_aux_surface_if_supported(device, image, plane, plane_format,
+                                            format_list_info,
+                                            ANV_OFFSET_IMPLICIT, stride,
+                                            isl_extra_usage_flags);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   return VK_SUCCESS;
+}
+
+/**
+ * Use when the app provides VkImageDrmFormatModifierExplicitCreateInfoEXT.
+ */
+static VkResult
+add_all_surfaces_explicit_layout(
+   struct anv_device *device,
+   struct anv_image *image,
+   const VkImageFormatListCreateInfo *format_list_info,
+   const VkImageDrmFormatModifierExplicitCreateInfoEXT *drm_info,
+   isl_tiling_flags_t isl_tiling_flags,
+   isl_surf_usage_flags_t isl_extra_usage_flags)
+{
+   const struct intel_device_info *devinfo = device->info;
+   const uint32_t mod_plane_count = drm_info->drmFormatModifierPlaneCount;
+   const bool mod_has_aux =
+      isl_drm_modifier_has_aux(drm_info->drmFormatModifier);
+   VkResult result;
+
+   /* About valid usage in the Vulkan spec:
+    *
+    * Unlike vanilla vkCreateImage, which produces undefined behavior on user
+    * error, here the spec requires the implementation to return
+    * VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT if the app provides
+    * a bad plane layout. However, the spec does require
+    * drmFormatModifierPlaneCount to be valid.
+    *
+    * Most validation of plane layout occurs in add_surface().
+    */
+
+   /* We support a restricted set of images with modifiers.
+    *
+    * With aux usage,
+    * - Format plane count must be 1.
+    * - Memory plane count must be 2.
+    * Without aux usage,
+    * - Each format plane must map to a distinct memory plane.
+    *
+    * For the other cases, currently there is no way to properly map memory
+    * planes to format planes and aux planes due to the lack of defined ABI
+    * for external multi-planar images.
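+    *
+    * As an illustration of the aux case (using I915_FORMAT_MOD_Y_TILED_CCS
+    * purely as an example of an aux-bearing modifier): the image has a
+    * single format plane but two memory planes, and a hypothetical app-side
+    * layout description could look like
+    *
+    *    VkSubresourceLayout layouts[2] = {
+    *       { .offset = 0,       .rowPitch = 7680 },  // memory plane 0: main
+    *       { .offset = 8294400, .rowPitch = 1920 },  // memory plane 1: CCS
+    *    };
+    *    VkImageDrmFormatModifierExplicitCreateInfoEXT drm_info = {
+    *       .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT,
+    *       .drmFormatModifier = I915_FORMAT_MOD_Y_TILED_CCS,
+    *       .drmFormatModifierPlaneCount = 2,
+    *       .pPlaneLayouts = layouts,
+    *    };
+    *
+    * The offsets and pitches above are made up for the example; the loop
+    * below simply reads pPlaneLayouts[0] for the primary surface and
+    * pPlaneLayouts[1] for the aux surface.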
+ */ + if (image->n_planes == 1) + assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); + else + assert(!(image->vk.aspects & ~VK_IMAGE_ASPECT_PLANES_BITS_ANV)); + + if (mod_has_aux) + assert(image->n_planes == 1 && mod_plane_count == 2); + else + assert(image->n_planes == mod_plane_count); + + /* Reject special values in the app-provided plane layouts. */ + for (uint32_t i = 0; i < mod_plane_count; ++i) { + if (drm_info->pPlaneLayouts[i].rowPitch == 0) { + return vk_errorf(device, + VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT, + "VkImageDrmFormatModifierExplicitCreateInfoEXT::" + "pPlaneLayouts[%u]::rowPitch is 0", i); + } + + if (drm_info->pPlaneLayouts[i].offset == ANV_OFFSET_IMPLICIT) { + return vk_errorf(device, + VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT, + "VkImageDrmFormatModifierExplicitCreateInfoEXT::" + "pPlaneLayouts[%u]::offset is %" PRIu64, + i, ANV_OFFSET_IMPLICIT); + } + } + + u_foreach_bit(b, image->vk.aspects) { + const VkImageAspectFlagBits aspect = 1 << b; + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + const struct anv_format_plane format_plane = + anv_get_format_plane(devinfo, image->vk.format, plane, image->vk.tiling); + const VkSubresourceLayout *primary_layout = &drm_info->pPlaneLayouts[plane]; + + result = add_primary_surface(device, image, plane, + format_plane, + primary_layout->offset, + primary_layout->rowPitch, + isl_tiling_flags, + isl_extra_usage_flags); + if (result != VK_SUCCESS) + return result; + + if (mod_has_aux) { + const VkSubresourceLayout *aux_layout = &drm_info->pPlaneLayouts[1]; + result = add_aux_surface_if_supported(device, image, plane, + format_plane, + format_list_info, + aux_layout->offset, + aux_layout->rowPitch, + isl_extra_usage_flags); + if (result != VK_SUCCESS) + return result; + } + } + + return VK_SUCCESS; +} + +static const struct isl_drm_modifier_info * +choose_drm_format_mod(const struct anv_physical_device *device, + uint32_t modifier_count, const uint64_t *modifiers) +{ + uint64_t best_mod = UINT64_MAX; + uint32_t best_score = 0; + + for (uint32_t i = 0; i < modifier_count; ++i) { + uint32_t score = isl_drm_modifier_get_score(&device->info, modifiers[i]); + if (score > best_score) { + best_mod = modifiers[i]; + best_score = score; + } + } + + if (best_score > 0) + return isl_drm_modifier_get_info(best_mod); + else + return NULL; +} + +static VkImageUsageFlags +anv_image_create_usage(const VkImageCreateInfo *pCreateInfo, + VkImageUsageFlags usage) +{ + /* Add TRANSFER_SRC usage for multisample attachment images. This is + * because we might internally use the TRANSFER_SRC layout on them for + * blorp operations associated with resolving those into other attachments + * at the end of a subpass. + * + * Without this additional usage, we compute an incorrect AUX state in + * anv_layout_to_aux_state(). 
+ */ + if (pCreateInfo->samples > VK_SAMPLE_COUNT_1_BIT && + (usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT))) + usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + return usage; +} + +static VkResult MUST_CHECK +alloc_private_binding(struct anv_device *device, + struct anv_image *image, + const VkImageCreateInfo *create_info) +{ + struct anv_image_binding *binding = + &image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE]; + + if (binding->memory_range.size == 0) + return VK_SUCCESS; + + const VkImageSwapchainCreateInfoKHR *swapchain_info = + vk_find_struct_const(create_info->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR); + + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) { + /* The image will be bound to swapchain memory. */ + return VK_SUCCESS; + } + + return anv_device_alloc_bo(device, "image-binding-private", + binding->memory_range.size, 0, 0, + &binding->address.bo); +} + +VkResult +anv_image_init(struct anv_device *device, struct anv_image *image, + const struct anv_image_create_info *create_info) +{ + const VkImageCreateInfo *pCreateInfo = create_info->vk_info; + const struct VkImageDrmFormatModifierExplicitCreateInfoEXT *mod_explicit_info = NULL; + const struct isl_drm_modifier_info *isl_mod_info = NULL; + VkResult r; + + vk_image_init(&device->vk, &image->vk, pCreateInfo); + + image->vk.usage = anv_image_create_usage(pCreateInfo, image->vk.usage); + image->vk.stencil_usage = + anv_image_create_usage(pCreateInfo, image->vk.stencil_usage); + + if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + assert(!image->vk.wsi_legacy_scanout); + mod_explicit_info = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); + if (mod_explicit_info) { + isl_mod_info = isl_drm_modifier_get_info(mod_explicit_info->drmFormatModifier); + } else { + const struct VkImageDrmFormatModifierListCreateInfoEXT *mod_list_info = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); + isl_mod_info = choose_drm_format_mod(device->physical, + mod_list_info->drmFormatModifierCount, + mod_list_info->pDrmFormatModifiers); + } + + assert(isl_mod_info); + assert(image->vk.drm_format_mod == DRM_FORMAT_MOD_INVALID); + image->vk.drm_format_mod = isl_mod_info->modifier; + } + + for (int i = 0; i < ANV_IMAGE_MEMORY_BINDING_END; ++i) { + image->bindings[i] = (struct anv_image_binding) { + .memory_range = { .binding = i }, + }; + } + + /* In case of AHardwareBuffer import, we don't know the layout yet */ + if (image->vk.external_handle_types & + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID) { + image->from_ahb = true; + return VK_SUCCESS; + } + + image->n_planes = anv_get_format_planes(image->vk.format); + + /* The Vulkan 1.2.165 glossary says: + * + * A disjoint image consists of multiple disjoint planes, and is created + * with the VK_IMAGE_CREATE_DISJOINT_BIT bit set. 
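+    *
+    * So, for illustration, a two-plane YCbCr image created with
+    * VK_IMAGE_CREATE_DISJOINT_BIT ends up with n_planes == 2 and
+    * disjoint == true below, while the flag has no effect on a
+    * single-planar format (n_planes == 1).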
+ */ + image->disjoint = image->n_planes > 1 && + (pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT); + + const isl_tiling_flags_t isl_tiling_flags = + choose_isl_tiling_flags(device->info, create_info, isl_mod_info, + image->vk.wsi_legacy_scanout); + + const VkImageFormatListCreateInfo *fmt_list = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_FORMAT_LIST_CREATE_INFO); + + if (mod_explicit_info) { + r = add_all_surfaces_explicit_layout(device, image, fmt_list, + mod_explicit_info, isl_tiling_flags, + create_info->isl_extra_usage_flags); + } else { + r = add_all_surfaces_implicit_layout(device, image, fmt_list, 0, + isl_tiling_flags, + create_info->isl_extra_usage_flags); + } + + if (r != VK_SUCCESS) + goto fail; + + r = alloc_private_binding(device, image, pCreateInfo); + if (r != VK_SUCCESS) + goto fail; + + check_memory_bindings(device, image); + + r = check_drm_format_mod(device, image); + if (r != VK_SUCCESS) + goto fail; + + /* Once we have all the bindings, determine whether we can do non 0 fast + * clears for each plane. + */ + for (uint32_t p = 0; p < image->n_planes; p++) { + image->planes[p].can_non_zero_fast_clear = + can_fast_clear_with_non_zero_color(device->info, image, p, fmt_list); + } + + return VK_SUCCESS; + +fail: + vk_image_finish(&image->vk); + return r; +} + +void +anv_image_finish(struct anv_image *image) +{ + struct anv_device *device = + container_of(image->vk.base.device, struct anv_device, vk); + + if (image->from_gralloc) { + assert(!image->disjoint); + assert(image->n_planes == 1); + assert(image->planes[0].primary_surface.memory_range.binding == + ANV_IMAGE_MEMORY_BINDING_MAIN); + assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo != NULL); + anv_device_release_bo(device, image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo); + } + + struct anv_bo *private_bo = image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo; + if (private_bo) + anv_device_release_bo(device, private_bo); + + vk_image_finish(&image->vk); +} + +static struct anv_image * +anv_swapchain_get_image(VkSwapchainKHR swapchain, + uint32_t index) +{ + VkImage image = wsi_common_get_image(swapchain, index); + return anv_image_from_handle(image); +} + +static VkResult +anv_image_init_from_create_info(struct anv_device *device, + struct anv_image *image, + const VkImageCreateInfo *pCreateInfo) +{ + const VkNativeBufferANDROID *gralloc_info = + vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID); + if (gralloc_info) + return anv_image_init_from_gralloc(device, image, pCreateInfo, + gralloc_info); + + struct anv_image_create_info create_info = { + .vk_info = pCreateInfo, + }; + + /* For dmabuf imports, configure the primary surface without support for + * compression if the modifier doesn't specify it. This helps to create + * VkImages with memory requirements that are compatible with the buffers + * apps provide. 
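+    *
+    * For example, importing a dmabuf that uses DRM_FORMAT_MOD_LINEAR (a
+    * modifier with no aux surface) takes the branch below:
+    * isl_drm_modifier_has_aux() returns false, ISL_SURF_USAGE_DISABLE_AUX_BIT
+    * is added, and no CCS is allocated, so the VkImage's memory requirements
+    * stay compatible with the buffer being imported.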
+ */ + const struct VkImageDrmFormatModifierExplicitCreateInfoEXT *mod_explicit_info = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); + if (mod_explicit_info && + !isl_drm_modifier_has_aux(mod_explicit_info->drmFormatModifier)) + create_info.isl_extra_usage_flags |= ISL_SURF_USAGE_DISABLE_AUX_BIT; + + return anv_image_init(device, image, &create_info); +} + +VkResult anv_CreateImage( + VkDevice _device, + const VkImageCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkImage* pImage) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + +#ifndef VK_USE_PLATFORM_ANDROID_KHR + /* Ignore swapchain creation info on Android. Since we don't have an + * implementation in Mesa, we're guaranteed to access an Android object + * incorrectly. + */ + const VkImageSwapchainCreateInfoKHR *swapchain_info = + vk_find_struct_const(pCreateInfo->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR); + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) { + return wsi_common_create_swapchain_image(&device->physical->wsi_device, + pCreateInfo, + swapchain_info->swapchain, + pImage); + } +#endif + + struct anv_image *image = + vk_object_zalloc(&device->vk, pAllocator, sizeof(*image), + VK_OBJECT_TYPE_IMAGE); + if (!image) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + VkResult result = anv_image_init_from_create_info(device, image, + pCreateInfo); + if (result != VK_SUCCESS) { + vk_object_free(&device->vk, pAllocator, image); + return result; + } + + *pImage = anv_image_to_handle(image); + + return result; +} + +void +anv_DestroyImage(VkDevice _device, VkImage _image, + const VkAllocationCallbacks *pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_image, image, _image); + + if (!image) + return; + + assert(&device->vk == image->vk.base.device); + anv_image_finish(image); + + vk_free2(&device->vk.alloc, pAllocator, image); +} + +/* We are binding AHardwareBuffer. Get a description, resolve the + * format and prepare anv_image properly. + */ +static void +resolve_ahw_image(struct anv_device *device, + struct anv_image *image, + struct anv_device_memory *mem) +{ +#if defined(ANDROID) && ANDROID_API_LEVEL >= 26 + assert(mem->ahw); + AHardwareBuffer_Desc desc; + AHardwareBuffer_describe(mem->ahw, &desc); + VkResult result; + + /* Check tiling. */ + enum isl_tiling tiling; + result = anv_device_get_bo_tiling(device, mem->bo, &tiling); + assert(result == VK_SUCCESS); + + VkImageTiling vk_tiling = + tiling == ISL_TILING_LINEAR ? VK_IMAGE_TILING_LINEAR : + VK_IMAGE_TILING_OPTIMAL; + isl_tiling_flags_t isl_tiling_flags = (1u << tiling); + + /* Check format. */ + VkFormat vk_format = vk_format_from_android(desc.format, desc.usage); + enum isl_format isl_fmt = anv_get_isl_format(device->info, + vk_format, + VK_IMAGE_ASPECT_COLOR_BIT, + vk_tiling); + assert(isl_fmt != ISL_FORMAT_UNSUPPORTED); + + /* Handle RGB(X)->RGBA fallback. */ + switch (desc.format) { + case AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM: + case AHARDWAREBUFFER_FORMAT_R8G8B8X8_UNORM: + if (isl_format_is_rgb(isl_fmt)) + isl_fmt = isl_format_rgb_to_rgba(isl_fmt); + break; + } + + /* Now we are able to fill anv_image fields properly and create + * isl_surface for it. 
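+    *
+    * The AHardwareBuffer stride is reported in pixels, so it is converted to
+    * a byte pitch below. A minimal worked example, assuming an RGBA8888
+    * buffer (32 bits per block) with desc.stride = 1920:
+    *
+    *    stride = 1920 * (32 / 8) = 7680 bytes per row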
+    */
+   vk_image_set_format(&image->vk, vk_format);
+   image->n_planes = anv_get_format_planes(image->vk.format);
+
+   uint32_t stride = desc.stride *
+                     (isl_format_get_layout(isl_fmt)->bpb / 8);
+
+   result = add_all_surfaces_implicit_layout(device, image, NULL, stride,
+                                             isl_tiling_flags,
+                                             ISL_SURF_USAGE_DISABLE_AUX_BIT);
+   assert(result == VK_SUCCESS);
+#endif
+}
+
+void
+anv_image_get_memory_requirements(struct anv_device *device,
+                                  struct anv_image *image,
+                                  VkImageAspectFlags aspects,
+                                  VkMemoryRequirements2 *pMemoryRequirements)
+{
+   /* The Vulkan spec (git aaed022) says:
+    *
+    *    memoryTypeBits is a bitfield and contains one bit set for every
+    *    supported memory type for the resource. The bit `1<<i` is set if and
+    *    only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
+    *    structure for the physical device is supported.
+    *
+    * All types are currently supported for images.
+    */
+   uint32_t memory_types = (1ull << device->physical->memory.type_count) - 1;
+
+   vk_foreach_struct(ext, pMemoryRequirements->pNext) {
+      switch (ext->sType) {
+      case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
+         VkMemoryDedicatedRequirements *requirements = (void *)ext;
+         if (image->vk.wsi_legacy_scanout || image->from_ahb) {
+            /* If we need to set the tiling for external consumers, we need a
+             * dedicated allocation.
+             *
+             * See also anv_AllocateMemory.
+             */
+            requirements->prefersDedicatedAllocation = true;
+            requirements->requiresDedicatedAllocation = true;
+         } else {
+            requirements->prefersDedicatedAllocation = false;
+            requirements->requiresDedicatedAllocation = false;
+         }
+         break;
+      }
+
+      default:
+         anv_debug_ignored_stype(ext->sType);
+         break;
+      }
+   }
+
+   /* If the image is disjoint, then we must return the memory requirements for
+    * the single plane specified in VkImagePlaneMemoryRequirementsInfo. If
+    * non-disjoint, then exactly one set of memory requirements exists for the
+    * whole image.
+    *
+    * This is enforced by the Valid Usage for VkImageMemoryRequirementsInfo2,
+    * which requires that the app provide VkImagePlaneMemoryRequirementsInfo if
+    * and only if the image is disjoint (that is, multi-planar format and
+    * VK_IMAGE_CREATE_DISJOINT_BIT).
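+    *
+    * A minimal sketch of the app-side query that reaches the disjoint branch
+    * below (illustrative only):
+    *
+    *    VkImagePlaneMemoryRequirementsInfo plane_info = {
+    *       .sType = VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO,
+    *       .planeAspect = VK_IMAGE_ASPECT_PLANE_1_BIT,
+    *    };
+    *    VkImageMemoryRequirementsInfo2 info = {
+    *       .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
+    *       .pNext = &plane_info,
+    *       .image = image,
+    *    };
+    *    VkMemoryRequirements2 reqs = {
+    *       .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
+    *    };
+    *    vkGetImageMemoryRequirements2(device, &info, &reqs);
+    *
+    * in which case only the requested plane's binding is reported.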
+ */ + const struct anv_image_binding *binding; + if (image->disjoint) { + assert(util_bitcount(aspects) == 1); + assert(aspects & image->vk.aspects); + binding = image_aspect_to_binding(image, aspects); + } else { + assert(aspects == image->vk.aspects); + binding = &image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN]; + } + + pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { + .size = binding->memory_range.size, + .alignment = binding->memory_range.alignment, + .memoryTypeBits = memory_types, + }; +} + +void anv_GetImageMemoryRequirements2( + VkDevice _device, + const VkImageMemoryRequirementsInfo2* pInfo, + VkMemoryRequirements2* pMemoryRequirements) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_image, image, pInfo->image); + + VkImageAspectFlags aspects = image->vk.aspects; + + vk_foreach_struct_const(ext, pInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO: { + assert(image->disjoint); + const VkImagePlaneMemoryRequirementsInfo *plane_reqs = + (const VkImagePlaneMemoryRequirementsInfo *) ext; + aspects = plane_reqs->planeAspect; + break; + } + + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } + + anv_image_get_memory_requirements(device, image, aspects, + pMemoryRequirements); +} + +void anv_GetDeviceImageMemoryRequirementsKHR( + VkDevice _device, + const VkDeviceImageMemoryRequirements* pInfo, + VkMemoryRequirements2* pMemoryRequirements) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_image image = { 0 }; + + ASSERTED VkResult result = + anv_image_init_from_create_info(device, &image, pInfo->pCreateInfo); + assert(result == VK_SUCCESS); + + VkImageAspectFlags aspects = + image.disjoint ? pInfo->planeAspect : image.vk.aspects; + + anv_image_get_memory_requirements(device, &image, aspects, + pMemoryRequirements); +} + +void anv_GetImageSparseMemoryRequirements( + VkDevice device, + VkImage image, + uint32_t* pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements* pSparseMemoryRequirements) +{ + *pSparseMemoryRequirementCount = 0; +} + +void anv_GetImageSparseMemoryRequirements2( + VkDevice device, + const VkImageSparseMemoryRequirementsInfo2* pInfo, + uint32_t* pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2* pSparseMemoryRequirements) +{ + *pSparseMemoryRequirementCount = 0; +} + +void anv_GetDeviceImageSparseMemoryRequirementsKHR( + VkDevice device, + const VkDeviceImageMemoryRequirements* pInfo, + uint32_t* pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2* pSparseMemoryRequirements) +{ + *pSparseMemoryRequirementCount = 0; +} + +VkResult anv_BindImageMemory2( + VkDevice _device, + uint32_t bindInfoCount, + const VkBindImageMemoryInfo* pBindInfos) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + for (uint32_t i = 0; i < bindInfoCount; i++) { + const VkBindImageMemoryInfo *bind_info = &pBindInfos[i]; + ANV_FROM_HANDLE(anv_device_memory, mem, bind_info->memory); + ANV_FROM_HANDLE(anv_image, image, bind_info->image); + bool did_bind = false; + + /* Resolve will alter the image's aspects, do this first. */ + if (mem && mem->ahw) + resolve_ahw_image(device, image, mem); + + vk_foreach_struct_const(s, bind_info->pNext) { + switch (s->sType) { + case VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO: { + const VkBindImagePlaneMemoryInfo *plane_info = + (const VkBindImagePlaneMemoryInfo *) s; + + /* Workaround for possible spec bug. 
+ * + * Unlike VkImagePlaneMemoryRequirementsInfo, which requires that + * the image be disjoint (that is, multi-planar format and + * VK_IMAGE_CREATE_DISJOINT_BIT), VkBindImagePlaneMemoryInfo allows + * the image to be non-disjoint and requires only that the image + * have the DISJOINT flag. In this case, regardless of the value of + * VkImagePlaneMemoryRequirementsInfo::planeAspect, the behavior is + * the same as if VkImagePlaneMemoryRequirementsInfo were omitted. + */ + if (!image->disjoint) + break; + + struct anv_image_binding *binding = + image_aspect_to_binding(image, plane_info->planeAspect); + + binding->address = (struct anv_address) { + .bo = mem->bo, + .offset = bind_info->memoryOffset, + }; + + did_bind = true; + break; + } + case VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR: { + /* Ignore this struct on Android, we cannot access swapchain + * structures there. + */ +#ifndef VK_USE_PLATFORM_ANDROID_KHR + const VkBindImageMemorySwapchainInfoKHR *swapchain_info = + (const VkBindImageMemorySwapchainInfoKHR *) s; + struct anv_image *swapchain_image = + anv_swapchain_get_image(swapchain_info->swapchain, + swapchain_info->imageIndex); + assert(swapchain_image); + assert(image->vk.aspects == swapchain_image->vk.aspects); + assert(mem == NULL); + + for (int j = 0; j < ARRAY_SIZE(image->bindings); ++j) { + assert(memory_ranges_equal(image->bindings[j].memory_range, + swapchain_image->bindings[j].memory_range)); + image->bindings[j].address = swapchain_image->bindings[j].address; + } + + /* We must bump the private binding's bo's refcount because, unlike the other + * bindings, its lifetime is not application-managed. + */ + struct anv_bo *private_bo = + image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo; + if (private_bo) + anv_bo_ref(private_bo); + + did_bind = true; +#endif + break; + } +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + case VK_STRUCTURE_TYPE_NATIVE_BUFFER_ANDROID: { + const VkNativeBufferANDROID *gralloc_info = + (const VkNativeBufferANDROID *)s; + VkResult result = anv_image_bind_from_gralloc(device, image, + gralloc_info); + if (result != VK_SUCCESS) + return result; + did_bind = true; + break; + } +#pragma GCC diagnostic pop + default: + anv_debug_ignored_stype(s->sType); + break; + } + } + + if (!did_bind) { + assert(!image->disjoint); + + image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address = + (struct anv_address) { + .bo = mem->bo, + .offset = bind_info->memoryOffset, + }; + + did_bind = true; + } + + /* On platforms that use implicit CCS, if the plane's bo lacks implicit + * CCS then disable compression on the plane. + */ + for (int p = 0; p < image->n_planes; ++p) { + enum anv_image_memory_binding binding = + image->planes[p].primary_surface.memory_range.binding; + const struct anv_bo *bo = + image->bindings[binding].address.bo; + + if (!bo || bo->has_implicit_ccs) + continue; + + if (!device->physical->has_implicit_ccs) + continue; + + if (!isl_aux_usage_has_ccs(image->planes[p].aux_usage)) + continue; + + anv_perf_warn(VK_LOG_OBJS(&image->vk.base), + "BO lacks implicit CCS. 
Disabling the CCS aux usage."); + + if (image->planes[p].aux_surface.memory_range.size > 0) { + assert(image->planes[p].aux_usage == ISL_AUX_USAGE_HIZ_CCS || + image->planes[p].aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT); + image->planes[p].aux_usage = ISL_AUX_USAGE_HIZ; + } else { + assert(image->planes[p].aux_usage == ISL_AUX_USAGE_CCS_E || + image->planes[p].aux_usage == ISL_AUX_USAGE_STC_CCS); + image->planes[p].aux_usage = ISL_AUX_USAGE_NONE; + } + } + } + + return VK_SUCCESS; +} + +void anv_GetImageSubresourceLayout( + VkDevice device, + VkImage _image, + const VkImageSubresource* subresource, + VkSubresourceLayout* layout) +{ + ANV_FROM_HANDLE(anv_image, image, _image); + const struct anv_surface *surface; + + assert(__builtin_popcount(subresource->aspectMask) == 1); + + /* The Vulkan spec requires that aspectMask be + * VK_IMAGE_ASPECT_MEMORY_PLANE_i_BIT_EXT if tiling is + * VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT. + * + * For swapchain images, the Vulkan spec says that every swapchain image has + * tiling VK_IMAGE_TILING_OPTIMAL, but we may choose + * VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT internally. Vulkan doesn't allow + * vkGetImageSubresourceLayout for images with VK_IMAGE_TILING_OPTIMAL, + * therefore it's invalid for the application to call this on a swapchain + * image. The WSI code, however, knows when it has internally created + * a swapchain image with VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, + * so it _should_ correctly use VK_IMAGE_ASPECT_MEMORY_PLANE_* in that case. + * But it incorrectly uses VK_IMAGE_ASPECT_PLANE_*, so we have a temporary + * workaround. + */ + if (image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + /* TODO(chadv): Drop this workaround when WSI gets fixed. */ + uint32_t mem_plane; + switch (subresource->aspectMask) { + case VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT: + case VK_IMAGE_ASPECT_PLANE_0_BIT: + mem_plane = 0; + break; + case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: + case VK_IMAGE_ASPECT_PLANE_1_BIT: + mem_plane = 1; + break; + case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: + case VK_IMAGE_ASPECT_PLANE_2_BIT: + mem_plane = 2; + break; + default: + unreachable("bad VkImageAspectFlags"); + } + + if (mem_plane == 1 && isl_drm_modifier_has_aux(image->vk.drm_format_mod)) { + assert(image->n_planes == 1); + /* If the memory binding differs between primary and aux, then the + * returned offset will be incorrect. 
+ */ + assert(image->planes[0].aux_surface.memory_range.binding == + image->planes[0].primary_surface.memory_range.binding); + surface = &image->planes[0].aux_surface; + } else { + assert(mem_plane < image->n_planes); + surface = &image->planes[mem_plane].primary_surface; + } + } else { + const uint32_t plane = + anv_image_aspect_to_plane(image, subresource->aspectMask); + surface = &image->planes[plane].primary_surface; + } + + layout->offset = surface->memory_range.offset; + layout->rowPitch = surface->isl.row_pitch_B; + layout->depthPitch = isl_surf_get_array_pitch(&surface->isl); + layout->arrayPitch = isl_surf_get_array_pitch(&surface->isl); + + if (subresource->mipLevel > 0 || subresource->arrayLayer > 0) { + assert(surface->isl.tiling == ISL_TILING_LINEAR); + + uint64_t offset_B; + isl_surf_get_image_offset_B_tile_sa(&surface->isl, + subresource->mipLevel, + subresource->arrayLayer, + 0 /* logical_z_offset_px */, + &offset_B, NULL, NULL); + layout->offset += offset_B; + layout->size = layout->rowPitch * anv_minify(image->vk.extent.height, + subresource->mipLevel) * + image->vk.extent.depth; + } else { + layout->size = surface->memory_range.size; + } +} + +/** + * This function returns the assumed isl_aux_state for a given VkImageLayout. + * Because Vulkan image layouts don't map directly to isl_aux_state enums, the + * returned enum is the assumed worst case. + * + * @param devinfo The device information of the Intel GPU. + * @param image The image that may contain a collection of buffers. + * @param aspect The aspect of the image to be accessed. + * @param layout The current layout of the image aspect(s). + * + * @return The primary buffer that should be used for the given layout. + */ +enum isl_aux_state ATTRIBUTE_PURE +anv_layout_to_aux_state(const struct intel_device_info * const devinfo, + const struct anv_image * const image, + const VkImageAspectFlagBits aspect, + const VkImageLayout layout) +{ + /* Validate the inputs. */ + + /* The devinfo is needed as the optimal buffer varies across generations. */ + assert(devinfo != NULL); + + /* The layout of a NULL image is not properly defined. */ + assert(image != NULL); + + /* The aspect must be exactly one of the image aspects. */ + assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects)); + + /* Determine the optimal buffer. */ + + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + /* If we don't have an aux buffer then aux state makes no sense */ + const enum isl_aux_usage aux_usage = image->planes[plane].aux_usage; + assert(aux_usage != ISL_AUX_USAGE_NONE); + + /* All images that use an auxiliary surface are required to be tiled. */ + assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR); + + /* Handle a few special cases */ + switch (layout) { + /* Invalid layouts */ + case VK_IMAGE_LAYOUT_MAX_ENUM: + unreachable("Invalid image layout."); + + /* Undefined layouts + * + * The pre-initialized layout is equivalent to the undefined layout for + * optimally-tiled images. We can only do color compression (CCS or HiZ) + * on tiled images. + */ + case VK_IMAGE_LAYOUT_UNDEFINED: + case VK_IMAGE_LAYOUT_PREINITIALIZED: + return ISL_AUX_STATE_AUX_INVALID; + + case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: { + assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); + + enum isl_aux_state aux_state = + isl_drm_modifier_get_default_aux_state(image->vk.drm_format_mod); + + switch (aux_state) { + case ISL_AUX_STATE_AUX_INVALID: + /* The modifier does not support compression. 
But, if we arrived + * here, then we have enabled compression on it anyway, in which case + * we must resolve the aux surface before we release ownership to the + * presentation engine (because, having no modifier, the presentation + * engine will not be aware of the aux surface). The presentation + * engine will not access the aux surface (because it is unware of + * it), and so the aux surface will still be resolved when we + * re-acquire ownership. + * + * Therefore, at ownership transfers in either direction, there does + * exist an aux surface despite the lack of modifier and its state is + * pass-through. + */ + return ISL_AUX_STATE_PASS_THROUGH; + case ISL_AUX_STATE_COMPRESSED_NO_CLEAR: + return ISL_AUX_STATE_COMPRESSED_NO_CLEAR; + default: + unreachable("unexpected isl_aux_state"); + } + } + + default: + break; + } + + const bool read_only = vk_image_layout_is_read_only(layout, aspect); + + const VkImageUsageFlags image_aspect_usage = + vk_image_usage(&image->vk, aspect); + const VkImageUsageFlags usage = + vk_image_layout_to_usage_flags(layout, aspect) & image_aspect_usage; + + bool aux_supported = true; + bool clear_supported = isl_aux_usage_has_fast_clears(aux_usage); + + if ((usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) && !read_only) { + /* This image could be used as both an input attachment and a render + * target (depth, stencil, or color) at the same time and this can cause + * corruption. + * + * We currently only disable aux in this way for depth even though we + * disable it for color in GL. + * + * TODO: Should we be disabling this in more cases? + */ + if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT && devinfo->ver <= 9) { + aux_supported = false; + clear_supported = false; + } + } + + if (usage & (VK_IMAGE_USAGE_TRANSFER_SRC_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) { + switch (aux_usage) { + case ISL_AUX_USAGE_HIZ: + if (!anv_can_sample_with_hiz(devinfo, image)) { + aux_supported = false; + clear_supported = false; + } + break; + + case ISL_AUX_USAGE_HIZ_CCS: + aux_supported = false; + clear_supported = false; + break; + + case ISL_AUX_USAGE_HIZ_CCS_WT: + break; + + case ISL_AUX_USAGE_CCS_D: + aux_supported = false; + clear_supported = false; + break; + + case ISL_AUX_USAGE_MCS: + if (!anv_can_sample_mcs_with_clear(devinfo, image)) + clear_supported = false; + break; + + case ISL_AUX_USAGE_CCS_E: + case ISL_AUX_USAGE_STC_CCS: + break; + + default: + unreachable("Unsupported aux usage"); + } + } + + switch (aux_usage) { + case ISL_AUX_USAGE_HIZ: + case ISL_AUX_USAGE_HIZ_CCS: + case ISL_AUX_USAGE_HIZ_CCS_WT: + if (aux_supported) { + assert(clear_supported); + return ISL_AUX_STATE_COMPRESSED_CLEAR; + } else if (read_only) { + return ISL_AUX_STATE_RESOLVED; + } else { + return ISL_AUX_STATE_AUX_INVALID; + } + + case ISL_AUX_USAGE_CCS_D: + /* We only support clear in exactly one state */ + if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { + assert(aux_supported); + assert(clear_supported); + return ISL_AUX_STATE_PARTIAL_CLEAR; + } else { + return ISL_AUX_STATE_PASS_THROUGH; + } + + case ISL_AUX_USAGE_CCS_E: + if (aux_supported) { + assert(clear_supported); + return ISL_AUX_STATE_COMPRESSED_CLEAR; + } else { + return ISL_AUX_STATE_PASS_THROUGH; + } + + case ISL_AUX_USAGE_MCS: + assert(aux_supported); + if (clear_supported) { + return ISL_AUX_STATE_COMPRESSED_CLEAR; + } else { + return ISL_AUX_STATE_COMPRESSED_NO_CLEAR; + } + + case ISL_AUX_USAGE_STC_CCS: + assert(aux_supported); + assert(!clear_supported); + return 
ISL_AUX_STATE_COMPRESSED_NO_CLEAR; + + default: + unreachable("Unsupported aux usage"); + } +} + +/** + * This function determines the optimal buffer to use for a given + * VkImageLayout and other pieces of information needed to make that + * determination. This does not determine the optimal buffer to use + * during a resolve operation. + * + * @param devinfo The device information of the Intel GPU. + * @param image The image that may contain a collection of buffers. + * @param aspect The aspect of the image to be accessed. + * @param usage The usage which describes how the image will be accessed. + * @param layout The current layout of the image aspect(s). + * + * @return The primary buffer that should be used for the given layout. + */ +enum isl_aux_usage ATTRIBUTE_PURE +anv_layout_to_aux_usage(const struct intel_device_info * const devinfo, + const struct anv_image * const image, + const VkImageAspectFlagBits aspect, + const VkImageUsageFlagBits usage, + const VkImageLayout layout) +{ + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + /* If there is no auxiliary surface allocated, we must use the one and only + * main buffer. + */ + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + return ISL_AUX_USAGE_NONE; + + enum isl_aux_state aux_state = + anv_layout_to_aux_state(devinfo, image, aspect, layout); + + switch (aux_state) { + case ISL_AUX_STATE_CLEAR: + unreachable("We never use this state"); + + case ISL_AUX_STATE_PARTIAL_CLEAR: + assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + assert(image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D); + assert(image->vk.samples == 1); + return ISL_AUX_USAGE_CCS_D; + + case ISL_AUX_STATE_COMPRESSED_CLEAR: + case ISL_AUX_STATE_COMPRESSED_NO_CLEAR: + return image->planes[plane].aux_usage; + + case ISL_AUX_STATE_RESOLVED: + /* We can only use RESOLVED in read-only layouts because any write will + * either land us in AUX_INVALID or COMPRESSED_NO_CLEAR. We can do + * writes in PASS_THROUGH without destroying it so that is allowed. + */ + assert(vk_image_layout_is_read_only(layout, aspect)); + assert(util_is_power_of_two_or_zero(usage)); + if (usage == VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) { + /* If we have valid HiZ data and are using the image as a read-only + * depth/stencil attachment, we should enable HiZ so that we can get + * faster depth testing. + */ + return image->planes[plane].aux_usage; + } else { + return ISL_AUX_USAGE_NONE; + } + + case ISL_AUX_STATE_PASS_THROUGH: + case ISL_AUX_STATE_AUX_INVALID: + return ISL_AUX_USAGE_NONE; + } + + unreachable("Invalid isl_aux_state"); +} + +/** + * This function returns the level of unresolved fast-clear support of the + * given image in the given VkImageLayout. + * + * @param devinfo The device information of the Intel GPU. + * @param image The image that may contain a collection of buffers. + * @param aspect The aspect of the image to be accessed. + * @param usage The usage which describes how the image will be accessed. + * @param layout The current layout of the image aspect(s). 
+ */ +enum anv_fast_clear_type ATTRIBUTE_PURE +anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo, + const struct anv_image * const image, + const VkImageAspectFlagBits aspect, + const VkImageLayout layout) +{ + if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR)) + return ANV_FAST_CLEAR_NONE; + + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + /* If there is no auxiliary surface allocated, there are no fast-clears */ + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + return ANV_FAST_CLEAR_NONE; + + /* We don't support MSAA fast-clears on Ivybridge or Bay Trail because they + * lack the MI ALU which we need to determine the predicates. + */ + if (devinfo->verx10 == 70 && image->vk.samples > 1) + return ANV_FAST_CLEAR_NONE; + + enum isl_aux_state aux_state = + anv_layout_to_aux_state(devinfo, image, aspect, layout); + + switch (aux_state) { + case ISL_AUX_STATE_CLEAR: + unreachable("We never use this state"); + + case ISL_AUX_STATE_PARTIAL_CLEAR: + case ISL_AUX_STATE_COMPRESSED_CLEAR: + if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) { + return ANV_FAST_CLEAR_DEFAULT_VALUE; + } else if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { + /* The image might not support non zero fast clears when mutable. */ + if (!image->planes[plane].can_non_zero_fast_clear) + return ANV_FAST_CLEAR_DEFAULT_VALUE; + + /* When we're in a render pass we have the clear color data from the + * VkRenderPassBeginInfo and we can use arbitrary clear colors. They + * must get partially resolved before we leave the render pass. + */ + return ANV_FAST_CLEAR_ANY; + } else if (image->planes[plane].aux_usage == ISL_AUX_USAGE_MCS || + image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) { + if (devinfo->ver >= 11) { + /* The image might not support non zero fast clears when mutable. */ + if (!image->planes[plane].can_non_zero_fast_clear) + return ANV_FAST_CLEAR_DEFAULT_VALUE; + + /* On ICL and later, the sampler hardware uses a copy of the clear + * value that is encoded as a pixel value. Therefore, we can use + * any clear color we like for sampling. + */ + return ANV_FAST_CLEAR_ANY; + } else { + /* If the image has MCS or CCS_E enabled all the time then we can + * use fast-clear as long as the clear color is the default value + * of zero since this is the default value we program into every + * surface state used for texturing. 
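As a condensed view of the color fast-clear selection made here (including the fall-through returns just below), the following standalone sketch uses stand-in parameters in place of the devinfo, plane and layout checks; the enum and names are illustrative only:

    #include <stdbool.h>

    enum sketch_fast_clear {
       SKETCH_FAST_CLEAR_NONE,
       SKETCH_FAST_CLEAR_DEFAULT_VALUE,
       SKETCH_FAST_CLEAR_ANY,
    };

    /* "aux_always_on" stands for the MCS/CCS_E case and "gfx_ver" for
     * devinfo->ver; depth handling is omitted. */
    static enum sketch_fast_clear
    color_fast_clear_type(bool in_color_attachment_layout,
                          bool aux_always_on,
                          bool can_non_zero_fast_clear,
                          int gfx_ver)
    {
       if (in_color_attachment_layout) {
          /* Inside the render pass the clear color comes from
           * VkRenderPassBeginInfo, so any color is usable (it must be
           * partially resolved before the pass ends), unless the image
           * cannot do non-zero fast clears at all. */
          return can_non_zero_fast_clear ? SKETCH_FAST_CLEAR_ANY
                                         : SKETCH_FAST_CLEAR_DEFAULT_VALUE;
       }

       if (aux_always_on) {
          /* Outside the render pass, only ICL+ samplers read a pixel-encoded
           * copy of the clear color; older parts can only fast-clear to the
           * default zero value programmed into every texturing surface
           * state. */
          if (gfx_ver >= 11 && can_non_zero_fast_clear)
             return SKETCH_FAST_CLEAR_ANY;
          return SKETCH_FAST_CLEAR_DEFAULT_VALUE;
       }

       return SKETCH_FAST_CLEAR_NONE;
    }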
+ */ + return ANV_FAST_CLEAR_DEFAULT_VALUE; + } + } else { + return ANV_FAST_CLEAR_NONE; + } + + case ISL_AUX_STATE_COMPRESSED_NO_CLEAR: + case ISL_AUX_STATE_RESOLVED: + case ISL_AUX_STATE_PASS_THROUGH: + case ISL_AUX_STATE_AUX_INVALID: + return ANV_FAST_CLEAR_NONE; + } + + unreachable("Invalid isl_aux_state"); +} + + +static struct anv_state +alloc_surface_state(struct anv_device *device) +{ + return anv_state_pool_alloc(&device->surface_state_pool, 64, 64); +} + +static enum isl_channel_select +remap_swizzle(VkComponentSwizzle swizzle, + struct isl_swizzle format_swizzle) +{ + switch (swizzle) { + case VK_COMPONENT_SWIZZLE_ZERO: return ISL_CHANNEL_SELECT_ZERO; + case VK_COMPONENT_SWIZZLE_ONE: return ISL_CHANNEL_SELECT_ONE; + case VK_COMPONENT_SWIZZLE_R: return format_swizzle.r; + case VK_COMPONENT_SWIZZLE_G: return format_swizzle.g; + case VK_COMPONENT_SWIZZLE_B: return format_swizzle.b; + case VK_COMPONENT_SWIZZLE_A: return format_swizzle.a; + default: + unreachable("Invalid swizzle"); + } +} + +void +anv_image_fill_surface_state(struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + const struct isl_view *view_in, + isl_surf_usage_flags_t view_usage, + enum isl_aux_usage aux_usage, + const union isl_color_value *clear_color, + enum anv_image_view_state_flags flags, + struct anv_surface_state *state_inout, + struct brw_image_param *image_param_out) +{ + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + const struct anv_surface *surface = &image->planes[plane].primary_surface, + *aux_surface = &image->planes[plane].aux_surface; + + struct isl_view view = *view_in; + view.usage |= view_usage; + + /* For texturing with VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL from a + * compressed surface with a shadow surface, we use the shadow instead of + * the primary surface. The shadow surface will be tiled, unlike the main + * surface, so it should get significantly better performance. + */ + if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && + isl_format_is_compressed(view.format) && + (flags & ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL)) { + assert(isl_format_is_compressed(surface->isl.format)); + assert(surface->isl.tiling == ISL_TILING_LINEAR); + assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR); + surface = &image->planes[plane].shadow_surface; + } + + /* For texturing from stencil on gfx7, we have to sample from a shadow + * surface because we don't support W-tiling in the sampler. + */ + if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && + aspect == VK_IMAGE_ASPECT_STENCIL_BIT) { + assert(device->info->ver == 7); + assert(view_usage & ISL_SURF_USAGE_TEXTURE_BIT); + surface = &image->planes[plane].shadow_surface; + } + + if (view_usage == ISL_SURF_USAGE_RENDER_TARGET_BIT) + view.swizzle = anv_swizzle_for_render(view.swizzle); + + /* On Ivy Bridge and Bay Trail we do the swizzle in the shader */ + if (device->info->verx10 == 70) + view.swizzle = ISL_SWIZZLE_IDENTITY; + + /* If this is a HiZ buffer we can sample from with a programmable clear + * value (SKL+), define the clear value to the optimal constant. 
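The shadow-surface selection at the top of anv_image_fill_surface_state() reduces to two rules; a minimal sketch with booleans standing in for the anv_surface/isl_format checks (names are illustrative, not the real API):

    #include <stdbool.h>

    enum sketch_bound_surface { SKETCH_BIND_PRIMARY, SKETCH_BIND_SHADOW };

    static enum sketch_bound_surface
    surface_for_texturing(bool has_shadow, bool compressed_texture_optimal,
                          bool is_stencil, int gfx_ver)
    {
       /* A compressed, linear primary surface gets a tiled shadow copy that
        * is much faster to sample, so texture-optimal views use the shadow. */
       if (has_shadow && compressed_texture_optimal)
          return SKETCH_BIND_SHADOW;

       /* The gfx7 sampler cannot read W-tiled stencil, so stencil texturing
        * also goes through the shadow copy. */
       if (has_shadow && is_stencil && gfx_ver == 7)
          return SKETCH_BIND_SHADOW;

       return SKETCH_BIND_PRIMARY;
    }

Either way the isl_view stays the same; only the surface actually programmed into SURFACE_STATE changes.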
+ */ + union isl_color_value default_clear_color = { .u32 = { 0, } }; + if (device->info->ver >= 9 && aspect == VK_IMAGE_ASPECT_DEPTH_BIT) + default_clear_color.f32[0] = ANV_HZ_FC_VAL; + if (!clear_color) + clear_color = &default_clear_color; + + const struct anv_address address = + anv_image_address(image, &surface->memory_range); + + if (view_usage == ISL_SURF_USAGE_STORAGE_BIT && + (flags & ANV_IMAGE_VIEW_STATE_STORAGE_LOWERED) && + !isl_has_matching_typed_storage_image_format(device->info, + view.format)) { + /* In this case, we are a writeable storage buffer which needs to be + * lowered to linear. All tiling and offset calculations will be done in + * the shader. + */ + assert(aux_usage == ISL_AUX_USAGE_NONE); + isl_buffer_fill_state(&device->isl_dev, state_inout->state.map, + .address = anv_address_physical(address), + .size_B = surface->isl.size_B, + .format = ISL_FORMAT_RAW, + .swizzle = ISL_SWIZZLE_IDENTITY, + .stride_B = 1, + .mocs = anv_mocs(device, address.bo, view_usage)); + state_inout->address = address, + state_inout->aux_address = ANV_NULL_ADDRESS; + state_inout->clear_address = ANV_NULL_ADDRESS; + } else { + if (view_usage == ISL_SURF_USAGE_STORAGE_BIT && + (flags & ANV_IMAGE_VIEW_STATE_STORAGE_LOWERED)) { + /* Typed surface reads support a very limited subset of the shader + * image formats. Translate it into the closest format the hardware + * supports. + */ + enum isl_format lower_format = + isl_lower_storage_image_format(device->info, view.format); + if (aux_usage != ISL_AUX_USAGE_NONE) { + assert(device->info->verx10 >= 125); + assert(aux_usage == ISL_AUX_USAGE_CCS_E); + assert(isl_formats_are_ccs_e_compatible(device->info, + view.format, + lower_format)); + } + + /* If we lower the format, we should ensure either they both match in + * bits per channel or that there is no swizzle, because we can't use + * the swizzle for a different bit pattern. + */ + assert(isl_formats_have_same_bits_per_channel(lower_format, + view.format) || + isl_swizzle_is_identity_for_format(view.format, view.swizzle)); + + view.format = lower_format; + } + + const struct isl_surf *isl_surf = &surface->isl; + + struct isl_surf tmp_surf; + uint64_t offset_B = 0; + uint32_t tile_x_sa = 0, tile_y_sa = 0; + if (isl_format_is_compressed(surface->isl.format) && + !isl_format_is_compressed(view.format)) { + /* We're creating an uncompressed view of a compressed surface. This + * is allowed but only for a single level/layer. 
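Stepping back, the storage-image handling above picks one of three binding strategies; a sketch of that decision with stand-in predicates in place of the isl format queries:

    #include <stdbool.h>

    enum sketch_storage_binding {
       SKETCH_STORAGE_TYPED,          /* hardware handles the format as-is */
       SKETCH_STORAGE_LOWERED_TYPED,  /* closest typed format the HW supports */
       SKETCH_STORAGE_RAW_BUFFER,     /* bind as RAW, shader does the math */
    };

    static enum sketch_storage_binding
    pick_storage_binding(bool lowering_requested, bool has_matching_typed_format)
    {
       if (!lowering_requested)
          return SKETCH_STORAGE_TYPED;

       if (has_matching_typed_format)
          return SKETCH_STORAGE_LOWERED_TYPED;

       /* No usable typed format: describe the memory as a RAW buffer with a
        * 1-byte stride and let the shader compute tiling/offsets itself. */
       return SKETCH_STORAGE_RAW_BUFFER;
    }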
+ */ + assert(surface->isl.samples == 1); + assert(view.levels == 1); + assert(view.array_len == 1); + + ASSERTED bool ok = + isl_surf_get_uncompressed_surf(&device->isl_dev, isl_surf, &view, + &tmp_surf, &view, + &offset_B, &tile_x_sa, &tile_y_sa); + assert(ok); + isl_surf = &tmp_surf; + + if (device->info->ver <= 8) { + assert(surface->isl.tiling == ISL_TILING_LINEAR); + assert(tile_x_sa == 0); + assert(tile_y_sa == 0); + } + } + + state_inout->address = anv_address_add(address, offset_B); + + struct anv_address aux_address = ANV_NULL_ADDRESS; + if (aux_usage != ISL_AUX_USAGE_NONE) + aux_address = anv_image_address(image, &aux_surface->memory_range); + state_inout->aux_address = aux_address; + + struct anv_address clear_address = ANV_NULL_ADDRESS; + if (device->info->ver >= 10 && isl_aux_usage_has_fast_clears(aux_usage)) { + clear_address = anv_image_get_clear_color_addr(device, image, aspect); + } + state_inout->clear_address = clear_address; + + isl_surf_fill_state(&device->isl_dev, state_inout->state.map, + .surf = isl_surf, + .view = &view, + .address = anv_address_physical(state_inout->address), + .clear_color = *clear_color, + .aux_surf = &aux_surface->isl, + .aux_usage = aux_usage, + .aux_address = anv_address_physical(aux_address), + .clear_address = anv_address_physical(clear_address), + .use_clear_address = !anv_address_is_null(clear_address), + .mocs = anv_mocs(device, state_inout->address.bo, + view_usage), + .x_offset_sa = tile_x_sa, + .y_offset_sa = tile_y_sa); + + /* With the exception of gfx8, the bottom 12 bits of the MCS base address + * are used to store other information. This should be ok, however, + * because the surface buffer addresses are always 4K page aligned. + */ + if (!anv_address_is_null(aux_address)) { + uint32_t *aux_addr_dw = state_inout->state.map + + device->isl_dev.ss.aux_addr_offset; + assert((aux_address.offset & 0xfff) == 0); + state_inout->aux_address.offset |= *aux_addr_dw & 0xfff; + } + + if (device->info->ver >= 10 && clear_address.bo) { + uint32_t *clear_addr_dw = state_inout->state.map + + device->isl_dev.ss.clear_color_state_offset; + assert((clear_address.offset & 0x3f) == 0); + state_inout->clear_address.offset |= *clear_addr_dw & 0x3f; + } + } + + if (image_param_out) { + assert(view_usage == ISL_SURF_USAGE_STORAGE_BIT); + isl_surf_fill_image_param(&device->isl_dev, image_param_out, + &surface->isl, &view); + } +} + +static uint32_t +anv_image_aspect_get_planes(VkImageAspectFlags aspect_mask) +{ + anv_assert_valid_aspect_set(aspect_mask); + return util_bitcount(aspect_mask); +} + +VkResult +anv_CreateImageView(VkDevice _device, + const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImageView *pView) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_image, image, pCreateInfo->image); + struct anv_image_view *iview; + + iview = vk_image_view_create(&device->vk, false, pCreateInfo, + pAllocator, sizeof(*iview)); + if (iview == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + iview->image = image; + iview->n_planes = anv_image_aspect_get_planes(iview->vk.aspects); + + /* Check if a conversion info was passed. 
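Referring back to the aux/clear address fix-ups in anv_image_fill_surface_state() above: because the buffers are 4K aligned, re-combining the buffer offset with the extra bits isl packed into the low bits of the address dword is a plain OR. A standalone sketch (stand-in parameters; the clear-color address does the same with its low 6 bits):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    merge_aux_address(uint64_t aux_buffer_offset, uint32_t aux_addr_dw)
    {
       /* Surface buffers are 4K page aligned, so bits 11:0 are free. */
       assert((aux_buffer_offset & 0xfff) == 0);

       /* Keep whatever isl stored in the low 12 bits of the dword. */
       return aux_buffer_offset | (aux_addr_dw & 0xfff);
    }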
*/ + const struct anv_format *conv_format = NULL; + const VkSamplerYcbcrConversionInfo *conv_info = + vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO); + +#ifdef ANDROID + /* If image has an external format, the pNext chain must contain an + * instance of VKSamplerYcbcrConversionInfo with a conversion object + * created with the same external format as image." + */ + assert(!image->vk.android_external_format || conv_info); +#endif + + if (conv_info) { + ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion, conv_info->conversion); + conv_format = conversion->format; + } + +#ifdef ANDROID + /* "If image has an external format, format must be VK_FORMAT_UNDEFINED." */ + assert(!image->vk.android_external_format || + pCreateInfo->format == VK_FORMAT_UNDEFINED); +#endif + + /* Format is undefined, this can happen when using external formats. Set + * view format from the passed conversion info. + */ + if (iview->vk.view_format == VK_FORMAT_UNDEFINED && conv_format) + iview->vk.view_format = conv_format->vk_format; + + /* Now go through the underlying image selected planes and map them to + * planes in the image view. + */ + anv_foreach_image_aspect_bit(iaspect_bit, image, iview->vk.aspects) { + const uint32_t iplane = + anv_aspect_to_plane(image->vk.aspects, 1UL << iaspect_bit); + const uint32_t vplane = + anv_aspect_to_plane(iview->vk.aspects, 1UL << iaspect_bit); + struct anv_format_plane format; + format = anv_get_format_plane(device->info, iview->vk.view_format, + vplane, image->vk.tiling); + + iview->planes[vplane].image_plane = iplane; + + iview->planes[vplane].isl = (struct isl_view) { + .format = format.isl_format, + .base_level = iview->vk.base_mip_level, + .levels = iview->vk.level_count, + .base_array_layer = iview->vk.base_array_layer, + .array_len = iview->vk.layer_count, + .min_lod_clamp = iview->vk.min_lod, + .swizzle = { + .r = remap_swizzle(iview->vk.swizzle.r, format.swizzle), + .g = remap_swizzle(iview->vk.swizzle.g, format.swizzle), + .b = remap_swizzle(iview->vk.swizzle.b, format.swizzle), + .a = remap_swizzle(iview->vk.swizzle.a, format.swizzle), + }, + }; + + if (pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_3D) { + iview->planes[vplane].isl.base_array_layer = 0; + iview->planes[vplane].isl.array_len = iview->vk.extent.depth; + } + + if (pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_CUBE || + pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) { + iview->planes[vplane].isl.usage = ISL_SURF_USAGE_CUBE_BIT; + } else { + iview->planes[vplane].isl.usage = 0; + } + + if (iview->vk.usage & (VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) { + iview->planes[vplane].optimal_sampler_surface_state.state = alloc_surface_state(device); + iview->planes[vplane].general_sampler_surface_state.state = alloc_surface_state(device); + + enum isl_aux_usage general_aux_usage = + anv_layout_to_aux_usage(device->info, image, 1UL << iaspect_bit, + VK_IMAGE_USAGE_SAMPLED_BIT, + VK_IMAGE_LAYOUT_GENERAL); + enum isl_aux_usage optimal_aux_usage = + anv_layout_to_aux_usage(device->info, image, 1UL << iaspect_bit, + VK_IMAGE_USAGE_SAMPLED_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + + anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit, + &iview->planes[vplane].isl, + ISL_SURF_USAGE_TEXTURE_BIT, + optimal_aux_usage, NULL, + ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL, + &iview->planes[vplane].optimal_sampler_surface_state, + NULL); + + anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit, + &iview->planes[vplane].isl, + 
ISL_SURF_USAGE_TEXTURE_BIT, + general_aux_usage, NULL, + 0, + &iview->planes[vplane].general_sampler_surface_state, + NULL); + } + + /* NOTE: This one needs to go last since it may stomp isl_view.format */ + if (iview->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT) { + enum isl_aux_usage general_aux_usage = + anv_layout_to_aux_usage(device->info, image, 1UL << iaspect_bit, + VK_IMAGE_USAGE_STORAGE_BIT, + VK_IMAGE_LAYOUT_GENERAL); + iview->planes[vplane].storage_surface_state.state = alloc_surface_state(device); + anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit, + &iview->planes[vplane].isl, + ISL_SURF_USAGE_STORAGE_BIT, + general_aux_usage, NULL, + 0, + &iview->planes[vplane].storage_surface_state, + NULL); + + if (isl_is_storage_image_format(format.isl_format)) { + iview->planes[vplane].lowered_storage_surface_state.state = + alloc_surface_state(device); + + anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit, + &iview->planes[vplane].isl, + ISL_SURF_USAGE_STORAGE_BIT, + general_aux_usage, NULL, + ANV_IMAGE_VIEW_STATE_STORAGE_LOWERED, + &iview->planes[vplane].lowered_storage_surface_state, + device->info->ver >= 9 ? NULL : + &iview->planes[vplane].lowered_storage_image_param); + } else { + /* In this case, we support the format but, because there's no + * SPIR-V format specifier corresponding to it, we only support it + * if the hardware can do it natively. This is possible for some + * reads but for most writes. Instead of hanging if someone gets + * it wrong, we give them a NULL descriptor. + */ + assert(isl_format_supports_typed_writes(device->info, + format.isl_format)); + iview->planes[vplane].lowered_storage_surface_state.state = + device->null_surface_state; + } + } + } + + *pView = anv_image_view_to_handle(iview); + + return VK_SUCCESS; +} + +void +anv_DestroyImageView(VkDevice _device, VkImageView _iview, + const VkAllocationCallbacks *pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_image_view, iview, _iview); + + if (!iview) + return; + + for (uint32_t plane = 0; plane < iview->n_planes; plane++) { + /* Check offset instead of alloc_size because this they might be + * device->null_surface_state which always has offset == 0. We don't + * own that one so we don't want to accidentally free it. 
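The ownership rule described just above comes down to a one-line test; sketched here with a stand-in struct rather than struct anv_state:

    #include <stdbool.h>
    #include <stdint.h>

    struct sketch_state { uint32_t offset; uint32_t alloc_size; };

    /* Pool-allocated surface states come back with a non-zero offset, while
     * the shared device->null_surface_state lives at offset 0 and belongs to
     * the device, so the offset doubles as an "is this ours to free?" flag. */
    static bool
    owns_surface_state(struct sketch_state s)
    {
       return s.offset != 0;
    }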
+ */ + if (iview->planes[plane].optimal_sampler_surface_state.state.offset) { + anv_state_pool_free(&device->surface_state_pool, + iview->planes[plane].optimal_sampler_surface_state.state); + } + + if (iview->planes[plane].general_sampler_surface_state.state.offset) { + anv_state_pool_free(&device->surface_state_pool, + iview->planes[plane].general_sampler_surface_state.state); + } + + if (iview->planes[plane].storage_surface_state.state.offset) { + anv_state_pool_free(&device->surface_state_pool, + iview->planes[plane].storage_surface_state.state); + } + + if (iview->planes[plane].lowered_storage_surface_state.state.offset) { + anv_state_pool_free(&device->surface_state_pool, + iview->planes[plane].lowered_storage_surface_state.state); + } + } + + vk_image_view_destroy(&device->vk, pAllocator, &iview->vk); +} + + +VkResult +anv_CreateBufferView(VkDevice _device, + const VkBufferViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkBufferView *pView) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer); + struct anv_buffer_view *view; + + view = vk_object_alloc(&device->vk, pAllocator, sizeof(*view), + VK_OBJECT_TYPE_BUFFER_VIEW); + if (!view) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct anv_format_plane format; + format = anv_get_format_plane(device->info, pCreateInfo->format, + 0, VK_IMAGE_TILING_LINEAR); + + const uint32_t format_bs = isl_format_get_layout(format.isl_format)->bpb / 8; + view->range = vk_buffer_range(&buffer->vk, pCreateInfo->offset, + pCreateInfo->range); + view->range = align_down_npot_u32(view->range, format_bs); + + view->address = anv_address_add(buffer->address, pCreateInfo->offset); + + if (buffer->vk.usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT) { + view->surface_state = alloc_surface_state(device); + + anv_fill_buffer_surface_state(device, view->surface_state, + format.isl_format, format.swizzle, + ISL_SURF_USAGE_TEXTURE_BIT, + view->address, view->range, format_bs); + } else { + view->surface_state = (struct anv_state){ 0 }; + } + + if (buffer->vk.usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) { + view->storage_surface_state = alloc_surface_state(device); + view->lowered_storage_surface_state = alloc_surface_state(device); + + anv_fill_buffer_surface_state(device, view->storage_surface_state, + format.isl_format, format.swizzle, + ISL_SURF_USAGE_STORAGE_BIT, + view->address, view->range, format_bs); + + enum isl_format lowered_format = + isl_has_matching_typed_storage_image_format(device->info, + format.isl_format) ? + isl_lower_storage_image_format(device->info, format.isl_format) : + ISL_FORMAT_RAW; + + /* If we lower the format, we should ensure either they both match in + * bits per channel or that there is no swizzle because we can't use + * the swizzle for a different bit pattern. + */ + assert(isl_formats_have_same_bits_per_channel(lowered_format, + format.isl_format) || + isl_swizzle_is_identity(format.swizzle)); + + anv_fill_buffer_surface_state(device, view->lowered_storage_surface_state, + lowered_format, format.swizzle, + ISL_SURF_USAGE_STORAGE_BIT, + view->address, view->range, + (lowered_format == ISL_FORMAT_RAW ? 
1 : + isl_format_get_layout(lowered_format)->bpb / 8)); + + isl_buffer_fill_image_param(&device->isl_dev, + &view->lowered_storage_image_param, + format.isl_format, view->range); + } else { + view->storage_surface_state = (struct anv_state){ 0 }; + view->lowered_storage_surface_state = (struct anv_state){ 0 }; + } + + *pView = anv_buffer_view_to_handle(view); + + return VK_SUCCESS; +} + +void +anv_DestroyBufferView(VkDevice _device, VkBufferView bufferView, + const VkAllocationCallbacks *pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_buffer_view, view, bufferView); + + if (!view) + return; + + if (view->surface_state.alloc_size > 0) + anv_state_pool_free(&device->surface_state_pool, + view->surface_state); + + if (view->storage_surface_state.alloc_size > 0) + anv_state_pool_free(&device->surface_state_pool, + view->storage_surface_state); + + if (view->lowered_storage_surface_state.alloc_size > 0) + anv_state_pool_free(&device->surface_state_pool, + view->lowered_storage_surface_state); + + vk_object_free(&device->vk, pAllocator, view); +} diff --git a/src/intel/vulkan_hasvk/anv_measure.c b/src/intel/vulkan_hasvk/anv_measure.c new file mode 100644 index 00000000000..f1e4d0eeba9 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_measure.c @@ -0,0 +1,516 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
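Looking back at anv_CreateBufferView() above: the view range is resolved against the buffer and then rounded down to a whole number of texels, since texel sizes need not be powers of two (RGB32 formats are 12 bytes, for instance). A standalone sketch of that computation (hypothetical helper; VK_WHOLE_SIZE spelled out as UINT64_MAX):

    #include <stdint.h>

    static uint64_t
    texel_view_range(uint64_t buffer_size, uint64_t view_offset,
                     uint64_t requested_range, uint32_t texel_size_B)
    {
       /* VK_WHOLE_SIZE (~0ull) means "to the end of the buffer". */
       uint64_t range = (requested_range == UINT64_MAX)
                      ? buffer_size - view_offset
                      : requested_range;

       /* align_down_npot: drop any trailing partial texel. */
       return range - (range % texel_size_B);
    }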
+ */ + +#include "anv_measure.h" + +#include +#include +#include + +#include "common/intel_measure.h" +#include "util/debug.h" + +struct anv_measure_batch { + struct anv_bo *bo; + struct intel_measure_batch base; +}; + +void +anv_measure_device_init(struct anv_physical_device *device) +{ + switch (device->info.verx10) { + case 125: + device->cmd_emit_timestamp = &gfx125_cmd_emit_timestamp; + break; + case 120: + device->cmd_emit_timestamp = &gfx12_cmd_emit_timestamp; + break; + case 110: + device->cmd_emit_timestamp = &gfx11_cmd_emit_timestamp; + break; + case 90: + device->cmd_emit_timestamp = &gfx9_cmd_emit_timestamp; + break; + case 80: + device->cmd_emit_timestamp = &gfx8_cmd_emit_timestamp; + break; + case 75: + device->cmd_emit_timestamp = &gfx75_cmd_emit_timestamp; + break; + case 70: + device->cmd_emit_timestamp = &gfx7_cmd_emit_timestamp; + break; + default: + assert(false); + } + + /* initialise list of measure structures that await rendering */ + struct intel_measure_device *measure_device = &device->measure_device; + intel_measure_init(measure_device); + struct intel_measure_config *config = measure_device->config; + if (config == NULL) + return; + + /* the final member of intel_measure_ringbuffer is a zero-length array of + * intel_measure_buffered_result objects. Allocate additional space for + * the buffered objects based on the run-time configurable buffer_size + */ + const size_t rb_bytes = sizeof(struct intel_measure_ringbuffer) + + config->buffer_size * sizeof(struct intel_measure_buffered_result); + struct intel_measure_ringbuffer * rb = + vk_zalloc(&device->instance->vk.alloc, + rb_bytes, 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + measure_device->ringbuffer = rb; +} + +static struct intel_measure_config* +config_from_command_buffer(struct anv_cmd_buffer *cmd_buffer) +{ + return cmd_buffer->device->physical->measure_device.config; +} + +void +anv_measure_init(struct anv_cmd_buffer *cmd_buffer) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_device *device = cmd_buffer->device; + + if (!config || !config->enabled) { + cmd_buffer->measure = NULL; + return; + } + + /* the final member of anv_measure is a zero-length array of + * intel_measure_snapshot objects. 
Create additional space for the + * snapshot objects based on the run-time configurable batch_size + */ + const size_t batch_bytes = sizeof(struct anv_measure_batch) + + config->batch_size * sizeof(struct intel_measure_snapshot); + struct anv_measure_batch * measure = + vk_alloc(&cmd_buffer->vk.pool->alloc, + batch_bytes, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + memset(measure, 0, batch_bytes); + ASSERTED VkResult result = + anv_device_alloc_bo(device, "measure data", + config->batch_size * sizeof(uint64_t), + ANV_BO_ALLOC_MAPPED, + 0, + (struct anv_bo**)&measure->bo); + measure->base.timestamps = measure->bo->map; + assert(result == VK_SUCCESS); + + cmd_buffer->measure = measure; +} + +static void +anv_measure_start_snapshot(struct anv_cmd_buffer *cmd_buffer, + enum intel_measure_snapshot_type type, + const char *event_name, + uint32_t count) +{ + struct anv_batch *batch = &cmd_buffer->batch; + struct anv_measure_batch *measure = cmd_buffer->measure; + struct anv_physical_device *device = cmd_buffer->device->physical; + struct intel_measure_device *measure_device = &device->measure_device; + + const unsigned device_frame = measure_device->frame; + + /* if the command buffer is not associated with a frame, associate it with + * the most recent acquired frame + */ + if (measure->base.frame == 0) + measure->base.frame = device_frame; + +// uintptr_t framebuffer = (uintptr_t)cmd_buffer->state.framebuffer; +// +// if (!measure->base.framebuffer && +// cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) +// /* secondary command buffer inherited the framebuffer from the primary */ +// measure->base.framebuffer = framebuffer; +// +// /* verify framebuffer has been properly tracked */ +// assert(type == INTEL_SNAPSHOT_END || +// framebuffer == measure->base.framebuffer || +// framebuffer == 0 ); /* compute has no framebuffer */ + + unsigned index = measure->base.index++; + + (*device->cmd_emit_timestamp)(batch, cmd_buffer->device, + (struct anv_address) { + .bo = measure->bo, + .offset = index * sizeof(uint64_t) }, + true /* end_of_pipe */); + + if (event_name == NULL) + event_name = intel_measure_snapshot_string(type); + + struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]); + memset(snapshot, 0, sizeof(*snapshot)); + snapshot->type = type; + snapshot->count = (unsigned) count; + snapshot->event_count = measure->base.event_count; + snapshot->event_name = event_name; +// snapshot->framebuffer = framebuffer; + + if (type == INTEL_SNAPSHOT_COMPUTE && cmd_buffer->state.compute.pipeline) { + snapshot->cs = (uintptr_t) cmd_buffer->state.compute.pipeline->cs; + } else if (cmd_buffer->state.gfx.pipeline) { + const struct anv_graphics_pipeline *pipeline = + cmd_buffer->state.gfx.pipeline; + snapshot->vs = (uintptr_t) pipeline->shaders[MESA_SHADER_VERTEX]; + snapshot->tcs = (uintptr_t) pipeline->shaders[MESA_SHADER_TESS_CTRL]; + snapshot->tes = (uintptr_t) pipeline->shaders[MESA_SHADER_TESS_EVAL]; + snapshot->gs = (uintptr_t) pipeline->shaders[MESA_SHADER_GEOMETRY]; + snapshot->fs = (uintptr_t) pipeline->shaders[MESA_SHADER_FRAGMENT]; + } +} + +static void +anv_measure_end_snapshot(struct anv_cmd_buffer *cmd_buffer, + uint32_t event_count) +{ + struct anv_batch *batch = &cmd_buffer->batch; + struct anv_measure_batch *measure = cmd_buffer->measure; + struct anv_physical_device *device = cmd_buffer->device->physical; + + unsigned index = measure->base.index++; + assert(index % 2 == 1); + + (*device->cmd_emit_timestamp)(batch, cmd_buffer->device, + (struct anv_address) { + 
.bo = measure->bo, + .offset = index * sizeof(uint64_t) }, + true /* end_of_pipe */); + + struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]); + memset(snapshot, 0, sizeof(*snapshot)); + snapshot->type = INTEL_SNAPSHOT_END; + snapshot->event_count = event_count; +} + +static bool +state_changed(struct anv_cmd_buffer *cmd_buffer, + enum intel_measure_snapshot_type type) +{ + uintptr_t vs=0, tcs=0, tes=0, gs=0, fs=0, cs=0; + + if (cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + /* can't record timestamps in this mode */ + return false; + + if (type == INTEL_SNAPSHOT_COMPUTE) { + const struct anv_compute_pipeline *cs_pipe = + cmd_buffer->state.compute.pipeline; + assert(cs_pipe); + cs = (uintptr_t)cs_pipe->cs; + } else if (type == INTEL_SNAPSHOT_DRAW) { + const struct anv_graphics_pipeline *gfx = cmd_buffer->state.gfx.pipeline; + assert(gfx); + vs = (uintptr_t) gfx->shaders[MESA_SHADER_VERTEX]; + tcs = (uintptr_t) gfx->shaders[MESA_SHADER_TESS_CTRL]; + tes = (uintptr_t) gfx->shaders[MESA_SHADER_TESS_EVAL]; + gs = (uintptr_t) gfx->shaders[MESA_SHADER_GEOMETRY]; + fs = (uintptr_t) gfx->shaders[MESA_SHADER_FRAGMENT]; + } + /* else blorp, all programs NULL */ + + return intel_measure_state_changed(&cmd_buffer->measure->base, + vs, tcs, tes, gs, fs, cs); +} + +void +_anv_measure_snapshot(struct anv_cmd_buffer *cmd_buffer, + enum intel_measure_snapshot_type type, + const char *event_name, + uint32_t count) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_measure_batch *measure = cmd_buffer->measure; + + assert(config); + if (measure == NULL) + return; + + assert(type != INTEL_SNAPSHOT_END); + if (!state_changed(cmd_buffer, type)) { + /* filter out this event */ + return; + } + + /* increment event count */ + ++measure->base.event_count; + if (measure->base.event_count == 1 || + measure->base.event_count == config->event_interval + 1) { + /* the first event of an interval */ + + if (measure->base.index % 2) { + /* end the previous event */ + anv_measure_end_snapshot(cmd_buffer, measure->base.event_count - 1); + } + measure->base.event_count = 1; + + if (measure->base.index == config->batch_size) { + /* Snapshot buffer is full. The batch must be flushed before + * additional snapshots can be taken. + */ + static bool warned = false; + if (unlikely(!warned)) { + fprintf(config->file, + "WARNING: batch size exceeds INTEL_MEASURE limit: %d. " + "Data has been dropped. " + "Increase setting with INTEL_MEASURE=batch_size={count}\n", + config->batch_size); + } + + warned = true; + return; + } + + anv_measure_start_snapshot(cmd_buffer, type, event_name, count); + } +} + +/** + * Called when a command buffer is reset. Re-initializes existing anv_measure + * data structures. + */ +void +anv_measure_reset(struct anv_cmd_buffer *cmd_buffer) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_device *device = cmd_buffer->device; + struct anv_measure_batch *measure = cmd_buffer->measure; + + if (!config) + return; + + if (!config->enabled) { + cmd_buffer->measure = NULL; + return; + } + + if (!measure) { + /* Capture has recently been enabled. Instead of resetting, a new data + * structure must be allocated and initialized. 
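A CPU-side sketch of the snapshot pairing and interval logic in _anv_measure_snapshot() above; only the index bookkeeping is modelled, the real code emits a GPU timestamp at each start/end step:

    struct sketch_measure {
       unsigned index;        /* next timestamp slot; odd = a start is open */
       unsigned event_count;  /* events since the current snapshot started */
    };

    static void
    on_event(struct sketch_measure *m, unsigned event_interval,
             unsigned batch_size)
    {
       m->event_count++;

       /* Only the first event of an interval starts a new snapshot. */
       if (m->event_count != 1 && m->event_count != event_interval + 1)
          return;

       if (m->index % 2)
          m->index++;              /* close the previous start/end pair */
       m->event_count = 1;

       if (m->index >= batch_size)
          return;                  /* snapshot buffer full: data is dropped */

       m->index++;                 /* open a new snapshot */
    }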
+ */ + return anv_measure_init(cmd_buffer); + } + + /* it is possible that the command buffer contains snapshots that have not + * yet been processed + */ + intel_measure_gather(&device->physical->measure_device, + device->info); + + assert(cmd_buffer->device != NULL); + + measure->base.index = 0; +// measure->base.framebuffer = 0; + measure->base.frame = 0; + measure->base.event_count = 0; + list_inithead(&measure->base.link); +} + +void +anv_measure_destroy(struct anv_cmd_buffer *cmd_buffer) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_measure_batch *measure = cmd_buffer->measure; + struct anv_device *device = cmd_buffer->device; + struct anv_physical_device *physical = device->physical; + + if (!config) + return; + if (measure == NULL) + return; + + /* it is possible that the command buffer contains snapshots that have not + * yet been processed + */ + intel_measure_gather(&physical->measure_device, &physical->info); + + anv_device_release_bo(device, measure->bo); + vk_free(&cmd_buffer->vk.pool->alloc, measure); + cmd_buffer->measure = NULL; +} + +static struct intel_measure_config* +config_from_device(struct anv_device *device) +{ + return device->physical->measure_device.config; +} + +void +anv_measure_device_destroy(struct anv_physical_device *device) +{ + struct intel_measure_device *measure_device = &device->measure_device; + struct intel_measure_config *config = measure_device->config; + + if (!config) + return; + + if (measure_device->ringbuffer != NULL) { + vk_free(&device->instance->vk.alloc, measure_device->ringbuffer); + measure_device->ringbuffer = NULL; + } +} + +/** + * Hook for command buffer submission. + */ +void +_anv_measure_submit(struct anv_cmd_buffer *cmd_buffer) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_measure_batch *measure = cmd_buffer->measure; + struct intel_measure_device *measure_device = &cmd_buffer->device->physical->measure_device; + + if (!config) + return; + if (measure == NULL) + return; + + struct intel_measure_batch *base = &measure->base; + if (base->index == 0) + /* no snapshots were started */ + return; + + /* finalize snapshots and enqueue them */ + static unsigned cmd_buffer_count = 0; + base->batch_count = p_atomic_inc_return(&cmd_buffer_count); + + if (base->index %2 == 1) { + anv_measure_end_snapshot(cmd_buffer, base->event_count); + base->event_count = 0; + } + + /* Mark the final timestamp as 'not completed'. This marker will be used + * to verify that rendering is complete. + */ + base->timestamps[base->index - 1] = 0; + + /* add to the list of submitted snapshots */ + pthread_mutex_lock(&measure_device->mutex); + list_addtail(&measure->base.link, &measure_device->queued_snapshots); + pthread_mutex_unlock(&measure_device->mutex); +} + +/** + * Hook for the start of a frame. 
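The "not completed" marker written in _anv_measure_submit() above implies a simple readiness test when results are gathered later; a hedged sketch (stand-in signature, not the intel_measure API):

    #include <stdbool.h>
    #include <stdint.h>

    /* The last slot is zeroed before submission and overwritten by the GPU
     * when the batch finishes, so a non-zero value there means every
     * snapshot in the batch can be read back. */
    static bool
    batch_results_ready(const volatile uint64_t *timestamps,
                        unsigned num_timestamps)
    {
       return num_timestamps > 0 && timestamps[num_timestamps - 1] != 0;
    }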
+ */ +void +_anv_measure_acquire(struct anv_device *device) +{ + struct intel_measure_config *config = config_from_device(device); + struct intel_measure_device *measure_device = &device->physical->measure_device; + + if (!config) + return; + if (measure_device == NULL) + return; + + intel_measure_frame_transition(p_atomic_inc_return(&measure_device->frame)); + + /* iterate the queued snapshots and publish those that finished */ + intel_measure_gather(measure_device, &device->physical->info); +} + +void +_anv_measure_endcommandbuffer(struct anv_cmd_buffer *cmd_buffer) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_measure_batch *measure = cmd_buffer->measure; + + if (!config) + return; + if (measure == NULL) + return; + if (measure->base.index % 2 == 0) + return; + + anv_measure_end_snapshot(cmd_buffer, measure->base.event_count); + measure->base.event_count = 0; +} + +void +_anv_measure_beginrenderpass(struct anv_cmd_buffer *cmd_buffer) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_measure_batch *measure = cmd_buffer->measure; + + if (!config) + return; + if (measure == NULL) + return; + +// if (measure->base.framebuffer == (uintptr_t) cmd_buffer->state.framebuffer) +// /* no change */ +// return; + + bool filtering = (config->flags & (INTEL_MEASURE_RENDERPASS | + INTEL_MEASURE_SHADER)); + if (filtering && measure->base.index % 2 == 1) { + /* snapshot for previous renderpass was not ended */ + anv_measure_end_snapshot(cmd_buffer, + measure->base.event_count); + measure->base.event_count = 0; + } + +// measure->base.framebuffer = (uintptr_t) cmd_buffer->state.framebuffer; +} + +void +_anv_measure_add_secondary(struct anv_cmd_buffer *primary, + struct anv_cmd_buffer *secondary) +{ + struct intel_measure_config *config = config_from_command_buffer(primary); + struct anv_measure_batch *measure = primary->measure; + if (!config) + return; + if (measure == NULL) + return; + if (config->flags & (INTEL_MEASURE_BATCH | INTEL_MEASURE_FRAME)) + /* secondary timing will be contained within the primary */ + return; + if (secondary->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) { + static bool warned = false; + if (unlikely(!warned)) { + fprintf(config->file, + "WARNING: INTEL_MEASURE cannot capture timings of commands " + "in secondary command buffers with " + "VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT set.\n"); + } + return; + } + + if (measure->base.index % 2 == 1) + anv_measure_end_snapshot(primary, measure->base.event_count); + + struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[measure->base.index]); + _anv_measure_snapshot(primary, INTEL_SNAPSHOT_SECONDARY_BATCH, NULL, 0); + + snapshot->secondary = &secondary->measure->base; +} diff --git a/src/intel/vulkan_hasvk/anv_measure.h b/src/intel/vulkan_hasvk/anv_measure.h new file mode 100644 index 00000000000..a058a5ac51e --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_measure.h @@ -0,0 +1,82 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright 
notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ANV_MEASURE_H +#define ANV_MEASURE_H + +#include "anv_private.h" +#include "common/intel_measure.h" + +void anv_measure_device_init(struct anv_physical_device *device); +void anv_measure_device_destroy(struct anv_physical_device *device); + +void anv_measure_init(struct anv_cmd_buffer *cmd_buffer); +void anv_measure_destroy(struct anv_cmd_buffer *cmd_buffer); +void anv_measure_reset(struct anv_cmd_buffer *cmd_buffer); + +void _anv_measure_snapshot(struct anv_cmd_buffer *cmd_buffer, + enum intel_measure_snapshot_type type, + const char *event_name, + uint32_t count); + +/* ends snapshots before command buffer submission */ +void _anv_measure_endcommandbuffer(struct anv_cmd_buffer *cmd_buffer); + +/* when measuring render passes, inserts a timestamp */ +void _anv_measure_beginrenderpass(struct anv_cmd_buffer *cmd_buffer); + +/* tracks frame progression */ +void _anv_measure_acquire(struct anv_device *device); + +/* should be combined with endcommandbuffer */ +void _anv_measure_submit(struct anv_cmd_buffer *cmd_buffer); + +void +_anv_measure_add_secondary(struct anv_cmd_buffer *primary, + struct anv_cmd_buffer *secondary); + +#define anv_measure_acquire(device) \ + if (unlikely(device->physical->measure_device.config)) \ + _anv_measure_acquire(device) + +#define anv_measure_snapshot(cmd_buffer, type, event_name, count) \ + if (unlikely(cmd_buffer->measure)) \ + _anv_measure_snapshot(cmd_buffer, type, event_name, count) + +#define anv_measure_endcommandbuffer(cmd_buffer) \ + if (unlikely(cmd_buffer->measure)) \ + _anv_measure_endcommandbuffer(cmd_buffer) + +#define anv_measure_beginrenderpass(cmd_buffer) \ + if (unlikely(cmd_buffer->measure)) \ + _anv_measure_beginrenderpass(cmd_buffer) + +#define anv_measure_submit(cmd_buffer) \ + if (unlikely(cmd_buffer->measure)) \ + _anv_measure_submit(cmd_buffer) + +#define anv_measure_add_secondary(primary, secondary) \ + if (unlikely(primary->measure)) \ + _anv_measure_add_secondary(primary, secondary) + +#endif /* ANV_MEASURE_H */ diff --git a/src/intel/vulkan_hasvk/anv_nir.h b/src/intel/vulkan_hasvk/anv_nir.h new file mode 100644 index 00000000000..86705dfd4f6 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir.h @@ -0,0 +1,97 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef ANV_NIR_H +#define ANV_NIR_H + +#include "nir/nir.h" +#include "anv_private.h" + +#ifdef __cplusplus +extern "C" { +#endif + +bool anv_check_for_primitive_replication(struct anv_device *device, + VkShaderStageFlags stages, + nir_shader **shaders, + uint32_t view_mask); + +bool anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask, + bool use_primitive_replication); + +bool anv_nir_lower_ycbcr_textures(nir_shader *shader, + const struct anv_pipeline_layout *layout); + +static inline nir_address_format +anv_nir_ssbo_addr_format(const struct anv_physical_device *pdevice, + bool robust_buffer_access) +{ + if (pdevice->has_a64_buffer_access) { + if (robust_buffer_access) + return nir_address_format_64bit_bounded_global; + else + return nir_address_format_64bit_global_32bit_offset; + } else { + return nir_address_format_32bit_index_offset; + } +} + +static inline nir_address_format +anv_nir_ubo_addr_format(const struct anv_physical_device *pdevice, + bool robust_buffer_access) +{ + if (pdevice->has_a64_buffer_access) { + if (robust_buffer_access) + return nir_address_format_64bit_bounded_global; + else + return nir_address_format_64bit_global_32bit_offset; + } else { + return nir_address_format_32bit_index_offset; + } +} + +bool anv_nir_lower_ubo_loads(nir_shader *shader); + +void anv_nir_apply_pipeline_layout(nir_shader *shader, + const struct anv_physical_device *pdevice, + bool robust_buffer_access, + const struct anv_pipeline_layout *layout, + struct anv_pipeline_bind_map *map); + +void anv_nir_compute_push_layout(nir_shader *nir, + const struct anv_physical_device *pdevice, + bool robust_buffer_access, + struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map, + void *mem_ctx); + +void anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map); + +bool anv_nir_add_base_work_group_id(nir_shader *shader); + +#ifdef __cplusplus +} +#endif + +#endif /* ANV_NIR_H */ diff --git a/src/intel/vulkan_hasvk/anv_nir_add_base_work_group_id.c b/src/intel/vulkan_hasvk/anv_nir_add_base_work_group_id.c new file mode 100644 index 00000000000..1283cb73eaa --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir_add_base_work_group_id.c @@ -0,0 +1,63 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
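Both anv_nir_ssbo_addr_format() and anv_nir_ubo_addr_format() above currently make the same choice; condensed into a single stand-in function for clarity (the enum constants are placeholders, not the real nir_address_format values):

    #include <stdbool.h>

    enum sketch_addr_fmt {
       SKETCH_FMT_32BIT_INDEX_OFFSET,        /* classic binding-table access */
       SKETCH_FMT_64BIT_GLOBAL_32BIT_OFFSET, /* A64, no bounds checking */
       SKETCH_FMT_64BIT_BOUNDED_GLOBAL,      /* A64 with bounds checking */
    };

    static enum sketch_addr_fmt
    buffer_addr_format(bool has_a64_buffer_access, bool robust_buffer_access)
    {
       if (!has_a64_buffer_access)
          return SKETCH_FMT_32BIT_INDEX_OFFSET;

       return robust_buffer_access ? SKETCH_FMT_64BIT_BOUNDED_GLOBAL
                                   : SKETCH_FMT_64BIT_GLOBAL_32BIT_OFFSET;
    }

Keeping separate UBO and SSBO entry points presumably leaves room for the two to diverge without touching callers.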
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_nir.h" +#include "nir/nir_builder.h" +#include "compiler/brw_compiler.h" + +static bool +anv_nir_add_base_work_group_id_instr(nir_builder *b, + nir_instr *instr, + UNUSED void *cb_data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *load_id = nir_instr_as_intrinsic(instr); + if (load_id->intrinsic != nir_intrinsic_load_workgroup_id) + return false; + + b->cursor = nir_after_instr(&load_id->instr); + + nir_ssa_def *load_base = + nir_load_push_constant(b, 3, 32, nir_imm_int(b, 0), + .base = offsetof(struct anv_push_constants, cs.base_work_group_id), + .range = 3 * sizeof(uint32_t)); + + nir_ssa_def *id = nir_iadd(b, &load_id->dest.ssa, load_base); + + nir_ssa_def_rewrite_uses_after(&load_id->dest.ssa, id, id->parent_instr); + return true; +} + +bool +anv_nir_add_base_work_group_id(nir_shader *shader) +{ + assert(shader->info.stage == MESA_SHADER_COMPUTE); + + return nir_shader_instructions_pass(shader, + anv_nir_add_base_work_group_id_instr, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} diff --git a/src/intel/vulkan_hasvk/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan_hasvk/anv_nir_apply_pipeline_layout.c new file mode 100644 index 00000000000..0dec0744516 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir_apply_pipeline_layout.c @@ -0,0 +1,1686 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_nir.h" +#include "program/prog_parameter.h" +#include "nir/nir_builder.h" +#include "compiler/brw_nir.h" +#include "util/mesa-sha1.h" +#include "util/set.h" + +/* Sampler tables don't actually have a maximum size but we pick one just so + * that we don't end up emitting too much state on-the-fly. 
+ */ +#define MAX_SAMPLER_TABLE_SIZE 128 +#define BINDLESS_OFFSET 255 + +#define sizeof_field(type, field) sizeof(((type *)0)->field) + +struct apply_pipeline_layout_state { + const struct anv_physical_device *pdevice; + + const struct anv_pipeline_layout *layout; + bool add_bounds_checks; + nir_address_format desc_addr_format; + nir_address_format ssbo_addr_format; + nir_address_format ubo_addr_format; + + /* Place to flag lowered instructions so we don't lower them twice */ + struct set *lowered_instrs; + + bool uses_constants; + bool has_dynamic_buffers; + uint8_t constants_offset; + struct { + bool desc_buffer_used; + uint8_t desc_offset; + + uint8_t *use_count; + uint8_t *surface_offsets; + uint8_t *sampler_offsets; + } set[MAX_SETS]; +}; + +static nir_address_format +addr_format_for_desc_type(VkDescriptorType desc_type, + struct apply_pipeline_layout_state *state) +{ + switch (desc_type) { + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + return state->ssbo_addr_format; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + return state->ubo_addr_format; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + return state->desc_addr_format; + + default: + unreachable("Unsupported descriptor type"); + } +} + +static void +add_binding(struct apply_pipeline_layout_state *state, + uint32_t set, uint32_t binding) +{ + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + if (state->set[set].use_count[binding] < UINT8_MAX) + state->set[set].use_count[binding]++; + + /* Only flag the descriptor buffer as used if there's actually data for + * this binding. This lets us be lazy and call this function constantly + * without worrying about unnecessarily enabling the buffer. 
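What add_binding() above tracks per binding, as a standalone sketch with stand-in parameters rather than the anv layout structs:

    #include <stdbool.h>
    #include <stdint.h>

    /* The per-binding use counter is only 8 bits wide, so it saturates at 255
     * instead of wrapping, and the set's descriptor buffer is only flagged as
     * used when the binding actually has descriptor data to read (a zero
     * stride means there is nothing in the buffer for it). */
    static void
    note_binding_use(uint8_t *use_count, bool *desc_buffer_used,
                     uint32_t descriptor_stride)
    {
       if (*use_count < UINT8_MAX)
          (*use_count)++;

       if (descriptor_stride != 0)
          *desc_buffer_used = true;
    }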
+ */ + if (bind_layout->descriptor_stride) + state->set[set].desc_buffer_used = true; +} + +static void +add_deref_src_binding(struct apply_pipeline_layout_state *state, nir_src src) +{ + nir_deref_instr *deref = nir_src_as_deref(src); + nir_variable *var = nir_deref_instr_get_variable(deref); + add_binding(state, var->data.descriptor_set, var->data.binding); +} + +static void +add_tex_src_binding(struct apply_pipeline_layout_state *state, + nir_tex_instr *tex, nir_tex_src_type deref_src_type) +{ + int deref_src_idx = nir_tex_instr_src_index(tex, deref_src_type); + if (deref_src_idx < 0) + return; + + add_deref_src_binding(state, tex->src[deref_src_idx].src); +} + +static bool +get_used_bindings(UNUSED nir_builder *_b, nir_instr *instr, void *_state) +{ + struct apply_pipeline_layout_state *state = _state; + + switch (instr->type) { + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_vulkan_resource_index: + add_binding(state, nir_intrinsic_desc_set(intrin), + nir_intrinsic_binding(intrin)); + break; + + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_fadd: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: + case nir_intrinsic_image_deref_load_param_intel: + case nir_intrinsic_image_deref_load_raw_intel: + case nir_intrinsic_image_deref_store_raw_intel: + add_deref_src_binding(state, intrin->src[0]); + break; + + case nir_intrinsic_load_constant: + state->uses_constants = true; + break; + + default: + break; + } + break; + } + case nir_instr_type_tex: { + nir_tex_instr *tex = nir_instr_as_tex(instr); + add_tex_src_binding(state, tex, nir_tex_src_texture_deref); + add_tex_src_binding(state, tex, nir_tex_src_sampler_deref); + break; + } + default: + break; + } + + return false; +} + +static nir_intrinsic_instr * +find_descriptor_for_index_src(nir_src src, + struct apply_pipeline_layout_state *state) +{ + nir_intrinsic_instr *intrin = nir_src_as_intrinsic(src); + + while (intrin && intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) + intrin = nir_src_as_intrinsic(intrin->src[0]); + + if (!intrin || intrin->intrinsic != nir_intrinsic_vulkan_resource_index) + return NULL; + + return intrin; +} + +static bool +descriptor_has_bti(nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_index); + + uint32_t set = nir_intrinsic_desc_set(intrin); + uint32_t binding = nir_intrinsic_binding(intrin); + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + uint32_t surface_index; + if (bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) + surface_index = state->set[set].desc_offset; + else + surface_index = state->set[set].surface_offsets[binding]; + + /* Only lower to a BTI message if we have a valid binding table index. 
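find_descriptor_for_index_src() above is a straightforward chain walk; the same shape with a stand-in node type instead of NIR intrinsics:

    #include <stddef.h>

    enum sketch_index_kind { SKETCH_RESOURCE_INDEX, SKETCH_RESOURCE_REINDEX };

    struct sketch_index_node {
       enum sketch_index_kind kind;
       struct sketch_index_node *parent;   /* src[0] of a reindex, else NULL */
    };

    /* Re-index nodes chain back to the vulkan_resource_index that started
     * them, so the root is found by walking parents until something that is
     * not a reindex appears. */
    static struct sketch_index_node *
    find_root_index(struct sketch_index_node *n)
    {
       while (n && n->kind == SKETCH_RESOURCE_REINDEX)
          n = n->parent;

       return (n && n->kind == SKETCH_RESOURCE_INDEX) ? n : NULL;
    }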
*/ + return surface_index < MAX_BINDING_TABLE_SIZE; +} + +static nir_address_format +descriptor_address_format(nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_index); + + return addr_format_for_desc_type(nir_intrinsic_desc_type(intrin), state); +} + +static nir_intrinsic_instr * +nir_deref_find_descriptor(nir_deref_instr *deref, + struct apply_pipeline_layout_state *state) +{ + while (1) { + /* Nothing we will use this on has a variable */ + assert(deref->deref_type != nir_deref_type_var); + + nir_deref_instr *parent = nir_src_as_deref(deref->parent); + if (!parent) + break; + + deref = parent; + } + assert(deref->deref_type == nir_deref_type_cast); + + nir_intrinsic_instr *intrin = nir_src_as_intrinsic(deref->parent); + if (!intrin || intrin->intrinsic != nir_intrinsic_load_vulkan_descriptor) + return false; + + return find_descriptor_for_index_src(intrin->src[0], state); +} + +static nir_ssa_def * +build_load_descriptor_mem(nir_builder *b, + nir_ssa_def *desc_addr, unsigned desc_offset, + unsigned num_components, unsigned bit_size, + struct apply_pipeline_layout_state *state) + +{ + switch (state->desc_addr_format) { + case nir_address_format_64bit_global_32bit_offset: { + nir_ssa_def *base_addr = + nir_pack_64_2x32(b, nir_channels(b, desc_addr, 0x3)); + nir_ssa_def *offset32 = + nir_iadd_imm(b, nir_channel(b, desc_addr, 3), desc_offset); + + return nir_load_global_constant_offset(b, num_components, bit_size, + base_addr, offset32, + .align_mul = 8, + .align_offset = desc_offset % 8); + } + + case nir_address_format_32bit_index_offset: { + nir_ssa_def *surface_index = nir_channel(b, desc_addr, 0); + nir_ssa_def *offset32 = + nir_iadd_imm(b, nir_channel(b, desc_addr, 1), desc_offset); + + return nir_load_ubo(b, num_components, bit_size, + surface_index, offset32, + .align_mul = 8, + .align_offset = desc_offset % 8, + .range_base = 0, + .range = ~0); + } + + default: + unreachable("Unsupported address format"); + } +} + +/** Build a Vulkan resource index + * + * A "resource index" is the term used by our SPIR-V parser and the relevant + * NIR intrinsics for a reference into a descriptor set. It acts much like a + * deref in NIR except that it accesses opaque descriptors instead of memory. + * + * Coming out of SPIR-V, both the resource indices (in the form of + * vulkan_resource_[re]index intrinsics) and the memory derefs (in the form + * of nir_deref_instr) use the same vector component/bit size. The meaning + * of those values for memory derefs (nir_deref_instr) is given by the + * nir_address_format associated with the descriptor type. For resource + * indices, it's an entirely internal to ANV encoding which describes, in some + * sense, the address of the descriptor. Thanks to the NIR/SPIR-V rules, it + * must be packed into the same size SSA values as a memory address. For this + * reason, the actual encoding may depend both on the address format for + * memory derefs and the descriptor address format. + * + * The load_vulkan_descriptor intrinsic exists to provide a transition point + * between these two forms of derefs: descriptor and memory. 
+ */ +static nir_ssa_def * +build_res_index(nir_builder *b, uint32_t set, uint32_t binding, + nir_ssa_def *array_index, nir_address_format addr_format, + struct apply_pipeline_layout_state *state) +{ + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + uint32_t array_size = bind_layout->array_size; + + switch (addr_format) { + case nir_address_format_64bit_global_32bit_offset: + case nir_address_format_64bit_bounded_global: { + uint32_t set_idx; + switch (state->desc_addr_format) { + case nir_address_format_64bit_global_32bit_offset: + set_idx = set; + break; + + case nir_address_format_32bit_index_offset: + assert(state->set[set].desc_offset < MAX_BINDING_TABLE_SIZE); + set_idx = state->set[set].desc_offset; + break; + + default: + unreachable("Unsupported address format"); + } + + assert(bind_layout->dynamic_offset_index < MAX_DYNAMIC_BUFFERS); + uint32_t dynamic_offset_index = 0xff; /* No dynamic offset */ + if (bind_layout->dynamic_offset_index >= 0) { + dynamic_offset_index = + state->layout->set[set].dynamic_offset_start + + bind_layout->dynamic_offset_index; + } + + const uint32_t packed = (bind_layout->descriptor_stride << 16 ) | (set_idx << 8) | dynamic_offset_index; + + return nir_vec4(b, nir_imm_int(b, packed), + nir_imm_int(b, bind_layout->descriptor_offset), + nir_imm_int(b, array_size - 1), + array_index); + } + + case nir_address_format_32bit_index_offset: { + assert(state->desc_addr_format == nir_address_format_32bit_index_offset); + if (bind_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + uint32_t surface_index = state->set[set].desc_offset; + return nir_imm_ivec2(b, surface_index, + bind_layout->descriptor_offset); + } else { + uint32_t surface_index = state->set[set].surface_offsets[binding]; + assert(array_size > 0 && array_size <= UINT16_MAX); + assert(surface_index <= UINT16_MAX); + uint32_t packed = ((array_size - 1) << 16) | surface_index; + return nir_vec2(b, array_index, nir_imm_int(b, packed)); + } + } + + default: + unreachable("Unsupported address format"); + } +} + +struct res_index_defs { + nir_ssa_def *set_idx; + nir_ssa_def *dyn_offset_base; + nir_ssa_def *desc_offset_base; + nir_ssa_def *array_index; + nir_ssa_def *desc_stride; +}; + +static struct res_index_defs +unpack_res_index(nir_builder *b, nir_ssa_def *index) +{ + struct res_index_defs defs; + + nir_ssa_def *packed = nir_channel(b, index, 0); + defs.desc_stride = nir_extract_u8(b, packed, nir_imm_int(b, 2)); + defs.set_idx = nir_extract_u8(b, packed, nir_imm_int(b, 1)); + defs.dyn_offset_base = nir_extract_u8(b, packed, nir_imm_int(b, 0)); + + defs.desc_offset_base = nir_channel(b, index, 1); + defs.array_index = nir_umin(b, nir_channel(b, index, 2), + nir_channel(b, index, 3)); + + return defs; +} + +/** Adjust a Vulkan resource index + * + * This is the equivalent of nir_deref_type_ptr_as_array for resource indices. + * For array descriptors, it allows us to adjust the array index. Thanks to + * variable pointers, we cannot always fold this re-index operation into the + * vulkan_resource_index intrinsic and we have to do it based on nothing but + * the address format. 
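+ *
+ * A minimal sketch of the effect (illustrative only): for the 64-bit
+ * address formats,
+ *
+ *    nir_ssa_def *reindexed =
+ *       build_res_reindex(b, orig, nir_imm_int(b, 2), addr_format);
+ *
+ * yields the same vector as orig with 2 added to its array-index channel
+ * (.w); for nir_address_format_32bit_index_offset the delta is added to
+ * the first channel instead.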
+ */ +static nir_ssa_def * +build_res_reindex(nir_builder *b, nir_ssa_def *orig, nir_ssa_def *delta, + nir_address_format addr_format) +{ + switch (addr_format) { + case nir_address_format_64bit_global_32bit_offset: + case nir_address_format_64bit_bounded_global: + return nir_vec4(b, nir_channel(b, orig, 0), + nir_channel(b, orig, 1), + nir_channel(b, orig, 2), + nir_iadd(b, nir_channel(b, orig, 3), delta)); + + case nir_address_format_32bit_index_offset: + return nir_vec2(b, nir_iadd(b, nir_channel(b, orig, 0), delta), + nir_channel(b, orig, 1)); + + default: + unreachable("Unhandled address format"); + } +} + +/** Get the address for a descriptor given its resource index + * + * Because of the re-indexing operations, we can't bounds check descriptor + * array access until we have the final index. That means we end up doing the + * bounds check here, if needed. See unpack_res_index() for more details. + * + * This function takes both a bind_layout and a desc_type which are used to + * determine the descriptor stride for array descriptors. The bind_layout is + * optional for buffer descriptor types. + */ +static nir_ssa_def * +build_desc_addr(nir_builder *b, + const struct anv_descriptor_set_binding_layout *bind_layout, + const VkDescriptorType desc_type, + nir_ssa_def *index, nir_address_format addr_format, + struct apply_pipeline_layout_state *state) +{ + switch (addr_format) { + case nir_address_format_64bit_global_32bit_offset: + case nir_address_format_64bit_bounded_global: { + struct res_index_defs res = unpack_res_index(b, index); + + nir_ssa_def *desc_offset = res.desc_offset_base; + if (desc_type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + /* Compute the actual descriptor offset. For inline uniform blocks, + * the array index is ignored as they are only allowed to be a single + * descriptor (not an array) and there is no concept of a "stride". + * + */ + desc_offset = + nir_iadd(b, desc_offset, nir_imul(b, res.array_index, res.desc_stride)); + } + + switch (state->desc_addr_format) { + case nir_address_format_64bit_global_32bit_offset: { + nir_ssa_def *base_addr = + nir_load_desc_set_address_intel(b, res.set_idx); + return nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_addr), + nir_unpack_64_2x32_split_y(b, base_addr), + nir_imm_int(b, UINT32_MAX), + desc_offset); + } + + case nir_address_format_32bit_index_offset: + return nir_vec2(b, res.set_idx, desc_offset); + + default: + unreachable("Unhandled address format"); + } + } + + case nir_address_format_32bit_index_offset: + assert(desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK); + assert(state->desc_addr_format == nir_address_format_32bit_index_offset); + return index; + + default: + unreachable("Unhandled address format"); + } +} + +/** Convert a Vulkan resource index into a buffer address + * + * In some cases, this does a memory load from the descriptor set and, in + * others, it simply converts from one form to another. + * + * See build_res_index for details about each resource index format. 
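+ *
+ * For example (illustrative, not exhaustive), lowering an SSBO access in
+ * one of the A64 address formats,
+ *
+ *    nir_ssa_def *addr =
+ *       build_buffer_addr_for_res_index(b, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ *                                       res_index, addr_format, state);
+ *
+ * loads the anv_address_range_descriptor from the descriptor buffer (and
+ * folds in any dynamic offset), whereas with
+ * nir_address_format_32bit_index_offset it reduces to a
+ * (surface_index + array_index, 0) pair with no memory access at all.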
+ */ +static nir_ssa_def * +build_buffer_addr_for_res_index(nir_builder *b, + const VkDescriptorType desc_type, + nir_ssa_def *res_index, + nir_address_format addr_format, + struct apply_pipeline_layout_state *state) +{ + if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + assert(addr_format == state->desc_addr_format); + return build_desc_addr(b, NULL, desc_type, res_index, addr_format, state); + } else if (addr_format == nir_address_format_32bit_index_offset) { + nir_ssa_def *array_index = nir_channel(b, res_index, 0); + nir_ssa_def *packed = nir_channel(b, res_index, 1); + nir_ssa_def *array_max = nir_extract_u16(b, packed, nir_imm_int(b, 1)); + nir_ssa_def *surface_index = nir_extract_u16(b, packed, nir_imm_int(b, 0)); + + if (state->add_bounds_checks) + array_index = nir_umin(b, array_index, array_max); + + return nir_vec2(b, nir_iadd(b, surface_index, array_index), + nir_imm_int(b, 0)); + } + + nir_ssa_def *desc_addr = + build_desc_addr(b, NULL, desc_type, res_index, addr_format, state); + + nir_ssa_def *desc = build_load_descriptor_mem(b, desc_addr, 0, 4, 32, state); + + if (state->has_dynamic_buffers) { + struct res_index_defs res = unpack_res_index(b, res_index); + + /* This shader has dynamic offsets and we have no way of knowing + * (save from the dynamic offset base index) if this buffer has a + * dynamic offset. + */ + nir_ssa_def *dyn_offset_idx = + nir_iadd(b, res.dyn_offset_base, res.array_index); + if (state->add_bounds_checks) { + dyn_offset_idx = nir_umin(b, dyn_offset_idx, + nir_imm_int(b, MAX_DYNAMIC_BUFFERS)); + } + + nir_ssa_def *dyn_load = + nir_load_push_constant(b, 1, 32, nir_imul_imm(b, dyn_offset_idx, 4), + .base = offsetof(struct anv_push_constants, dynamic_offsets), + .range = MAX_DYNAMIC_BUFFERS * 4); + + nir_ssa_def *dynamic_offset = + nir_bcsel(b, nir_ieq_imm(b, res.dyn_offset_base, 0xff), + nir_imm_int(b, 0), dyn_load); + + /* The dynamic offset gets added to the base pointer so that we + * have a sliding window range. + */ + nir_ssa_def *base_ptr = + nir_pack_64_2x32(b, nir_channels(b, desc, 0x3)); + base_ptr = nir_iadd(b, base_ptr, nir_u2u64(b, dynamic_offset)); + desc = nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_ptr), + nir_unpack_64_2x32_split_y(b, base_ptr), + nir_channel(b, desc, 2), + nir_channel(b, desc, 3)); + } + + /* The last element of the vec4 is always zero. + * + * See also struct anv_address_range_descriptor + */ + return nir_vec4(b, nir_channel(b, desc, 0), + nir_channel(b, desc, 1), + nir_channel(b, desc, 2), + nir_imm_int(b, 0)); +} + +/** Loads descriptor memory for a variable-based deref chain + * + * The deref chain has to terminate at a variable with a descriptor_set and + * binding set. This is used for images, textures, and samplers. 
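+ *
+ * Typical use (see lower_tex_deref() and lower_image_intrinsic() below) is
+ * pulling a couple of dwords out of the descriptor backing the variable,
+ * e.g. the texture/sampler handle pair of one plane of a sampled image:
+ *
+ *    nir_ssa_def *desc =
+ *       build_load_var_deref_descriptor_mem(b, deref, plane_offset,
+ *                                           2, 32, state);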
+ */ +static nir_ssa_def * +build_load_var_deref_descriptor_mem(nir_builder *b, nir_deref_instr *deref, + unsigned desc_offset, + unsigned num_components, unsigned bit_size, + struct apply_pipeline_layout_state *state) +{ + nir_variable *var = nir_deref_instr_get_variable(deref); + + const uint32_t set = var->data.descriptor_set; + const uint32_t binding = var->data.binding; + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + nir_ssa_def *array_index; + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + assert(nir_deref_instr_parent(deref)->deref_type == nir_deref_type_var); + assert(deref->arr.index.is_ssa); + array_index = deref->arr.index.ssa; + } else { + array_index = nir_imm_int(b, 0); + } + + /* It doesn't really matter what address format we choose as everything + * will constant-fold nicely. Choose one that uses the actual descriptor + * buffer so we don't run into issues index/offset assumptions. + */ + const nir_address_format addr_format = + nir_address_format_64bit_bounded_global; + + nir_ssa_def *res_index = + build_res_index(b, set, binding, array_index, addr_format, state); + + nir_ssa_def *desc_addr = + build_desc_addr(b, bind_layout, bind_layout->type, + res_index, addr_format, state); + + return build_load_descriptor_mem(b, desc_addr, desc_offset, + num_components, bit_size, state); +} + +/** A recursive form of build_res_index() + * + * This recursively walks a resource [re]index chain and builds the resource + * index. It places the new code with the resource [re]index operation in the + * hopes of better CSE. This means the cursor is not where you left it when + * this function returns. + */ +static nir_ssa_def * +build_res_index_for_chain(nir_builder *b, nir_intrinsic_instr *intrin, + nir_address_format addr_format, + uint32_t *set, uint32_t *binding, + struct apply_pipeline_layout_state *state) +{ + if (intrin->intrinsic == nir_intrinsic_vulkan_resource_index) { + b->cursor = nir_before_instr(&intrin->instr); + assert(intrin->src[0].is_ssa); + *set = nir_intrinsic_desc_set(intrin); + *binding = nir_intrinsic_binding(intrin); + return build_res_index(b, *set, *binding, intrin->src[0].ssa, + addr_format, state); + } else { + assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex); + nir_intrinsic_instr *parent = nir_src_as_intrinsic(intrin->src[0]); + nir_ssa_def *index = + build_res_index_for_chain(b, parent, addr_format, + set, binding, state); + + b->cursor = nir_before_instr(&intrin->instr); + + assert(intrin->src[1].is_ssa); + return build_res_reindex(b, index, intrin->src[1].ssa, addr_format); + } +} + +/** Builds a buffer address for a given vulkan [re]index intrinsic + * + * The cursor is not where you left it when this function returns. 
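+ *
+ * Illustrative call (this mirrors the get_ssbo_size lowering further down):
+ *
+ *    nir_ssa_def *addr =
+ *       build_buffer_addr_for_idx_intrin(b, idx_intrin,
+ *                                        nir_address_format_32bit_index_offset,
+ *                                        state);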
+ */ +static nir_ssa_def * +build_buffer_addr_for_idx_intrin(nir_builder *b, + nir_intrinsic_instr *idx_intrin, + nir_address_format addr_format, + struct apply_pipeline_layout_state *state) +{ + uint32_t set = UINT32_MAX, binding = UINT32_MAX; + nir_ssa_def *res_index = + build_res_index_for_chain(b, idx_intrin, addr_format, + &set, &binding, state); + + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + return build_buffer_addr_for_res_index(b, bind_layout->type, + res_index, addr_format, state); +} + +/** Builds a buffer address for deref chain + * + * This assumes that you can chase the chain all the way back to the original + * vulkan_resource_index intrinsic. + * + * The cursor is not where you left it when this function returns. + */ +static nir_ssa_def * +build_buffer_addr_for_deref(nir_builder *b, nir_deref_instr *deref, + nir_address_format addr_format, + struct apply_pipeline_layout_state *state) +{ + nir_deref_instr *parent = nir_deref_instr_parent(deref); + if (parent) { + nir_ssa_def *addr = + build_buffer_addr_for_deref(b, parent, addr_format, state); + + b->cursor = nir_before_instr(&deref->instr); + return nir_explicit_io_address_from_deref(b, deref, addr, addr_format); + } + + nir_intrinsic_instr *load_desc = nir_src_as_intrinsic(deref->parent); + assert(load_desc->intrinsic == nir_intrinsic_load_vulkan_descriptor); + + nir_intrinsic_instr *idx_intrin = nir_src_as_intrinsic(load_desc->src[0]); + + b->cursor = nir_before_instr(&deref->instr); + + return build_buffer_addr_for_idx_intrin(b, idx_intrin, addr_format, state); +} + +static bool +try_lower_direct_buffer_intrinsic(nir_builder *b, + nir_intrinsic_instr *intrin, bool is_atomic, + struct apply_pipeline_layout_state *state) +{ + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + if (!nir_deref_mode_is_one_of(deref, nir_var_mem_ubo | nir_var_mem_ssbo)) + return false; + + nir_intrinsic_instr *desc = nir_deref_find_descriptor(deref, state); + if (desc == NULL) { + /* We should always be able to find the descriptor for UBO access. */ + assert(nir_deref_mode_is_one_of(deref, nir_var_mem_ssbo)); + return false; + } + + nir_address_format addr_format = descriptor_address_format(desc, state); + + if (nir_deref_mode_is(deref, nir_var_mem_ssbo)) { + /* 64-bit atomics only support A64 messages so we can't lower them to + * the index+offset model. + */ + if (is_atomic && nir_dest_bit_size(intrin->dest) == 64 && + !state->pdevice->info.has_lsc) + return false; + + /* Normal binding table-based messages can't handle non-uniform access + * so we have to fall back to A64. 
+ */ + if (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM) + return false; + + if (!descriptor_has_bti(desc, state)) + return false; + + /* Rewrite to 32bit_index_offset whenever we can */ + addr_format = nir_address_format_32bit_index_offset; + } else { + assert(nir_deref_mode_is(deref, nir_var_mem_ubo)); + + /* Rewrite to 32bit_index_offset whenever we can */ + if (descriptor_has_bti(desc, state)) + addr_format = nir_address_format_32bit_index_offset; + } + + nir_ssa_def *addr = + build_buffer_addr_for_deref(b, deref, addr_format, state); + + b->cursor = nir_before_instr(&intrin->instr); + nir_lower_explicit_io_instr(b, intrin, addr, addr_format); + + return true; +} + +static bool +lower_load_accel_struct_desc(nir_builder *b, + nir_intrinsic_instr *load_desc, + struct apply_pipeline_layout_state *state) +{ + assert(load_desc->intrinsic == nir_intrinsic_load_vulkan_descriptor); + + nir_intrinsic_instr *idx_intrin = nir_src_as_intrinsic(load_desc->src[0]); + + /* It doesn't really matter what address format we choose as + * everything will constant-fold nicely. Choose one that uses the + * actual descriptor buffer. + */ + const nir_address_format addr_format = + nir_address_format_64bit_bounded_global; + + uint32_t set = UINT32_MAX, binding = UINT32_MAX; + nir_ssa_def *res_index = + build_res_index_for_chain(b, idx_intrin, addr_format, + &set, &binding, state); + + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + b->cursor = nir_before_instr(&load_desc->instr); + + nir_ssa_def *desc_addr = + build_desc_addr(b, bind_layout, bind_layout->type, + res_index, addr_format, state); + + /* Acceleration structure descriptors are always uint64_t */ + nir_ssa_def *desc = build_load_descriptor_mem(b, desc_addr, 0, 1, 64, state); + + assert(load_desc->dest.is_ssa); + assert(load_desc->dest.ssa.bit_size == 64); + assert(load_desc->dest.ssa.num_components == 1); + nir_ssa_def_rewrite_uses(&load_desc->dest.ssa, desc); + nir_instr_remove(&load_desc->instr); + + return true; +} + +static bool +lower_direct_buffer_instr(nir_builder *b, nir_instr *instr, void *_state) +{ + struct apply_pipeline_layout_state *state = _state; + + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_deref: + case nir_intrinsic_store_deref: + return try_lower_direct_buffer_intrinsic(b, intrin, false, state); + + case nir_intrinsic_deref_atomic_add: + case nir_intrinsic_deref_atomic_imin: + case nir_intrinsic_deref_atomic_umin: + case nir_intrinsic_deref_atomic_imax: + case nir_intrinsic_deref_atomic_umax: + case nir_intrinsic_deref_atomic_and: + case nir_intrinsic_deref_atomic_or: + case nir_intrinsic_deref_atomic_xor: + case nir_intrinsic_deref_atomic_exchange: + case nir_intrinsic_deref_atomic_comp_swap: + case nir_intrinsic_deref_atomic_fadd: + case nir_intrinsic_deref_atomic_fmin: + case nir_intrinsic_deref_atomic_fmax: + case nir_intrinsic_deref_atomic_fcomp_swap: + return try_lower_direct_buffer_intrinsic(b, intrin, true, state); + + case nir_intrinsic_get_ssbo_size: { + /* The get_ssbo_size intrinsic always just takes a + * index/reindex intrinsic. 
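+ * If that descriptor has a binding table entry, the buffer source is
+ * rewritten to the scalar BTI and the instruction is recorded in
+ * state->lowered_instrs so that lower_get_ssbo_size() (the generic path
+ * run later) knows to leave it alone.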
+ */ + nir_intrinsic_instr *idx_intrin = + find_descriptor_for_index_src(intrin->src[0], state); + if (idx_intrin == NULL || !descriptor_has_bti(idx_intrin, state)) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + + /* We just checked that this is a BTI descriptor */ + const nir_address_format addr_format = + nir_address_format_32bit_index_offset; + + nir_ssa_def *buffer_addr = + build_buffer_addr_for_idx_intrin(b, idx_intrin, addr_format, state); + + b->cursor = nir_before_instr(&intrin->instr); + nir_ssa_def *bti = nir_channel(b, buffer_addr, 0); + + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], + nir_src_for_ssa(bti)); + _mesa_set_add(state->lowered_instrs, intrin); + return true; + } + + case nir_intrinsic_load_vulkan_descriptor: + if (nir_intrinsic_desc_type(intrin) == + VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR) + return lower_load_accel_struct_desc(b, intrin, state); + return false; + + default: + return false; + } +} + +static bool +lower_res_index_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + b->cursor = nir_before_instr(&intrin->instr); + + nir_address_format addr_format = + addr_format_for_desc_type(nir_intrinsic_desc_type(intrin), state); + + assert(intrin->src[0].is_ssa); + nir_ssa_def *index = + build_res_index(b, nir_intrinsic_desc_set(intrin), + nir_intrinsic_binding(intrin), + intrin->src[0].ssa, + addr_format, state); + + assert(intrin->dest.is_ssa); + assert(intrin->dest.ssa.bit_size == index->bit_size); + assert(intrin->dest.ssa.num_components == index->num_components); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, index); + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_res_reindex_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + b->cursor = nir_before_instr(&intrin->instr); + + nir_address_format addr_format = + addr_format_for_desc_type(nir_intrinsic_desc_type(intrin), state); + + assert(intrin->src[0].is_ssa && intrin->src[1].is_ssa); + nir_ssa_def *index = + build_res_reindex(b, intrin->src[0].ssa, + intrin->src[1].ssa, + addr_format); + + assert(intrin->dest.is_ssa); + assert(intrin->dest.ssa.bit_size == index->bit_size); + assert(intrin->dest.ssa.num_components == index->num_components); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, index); + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + b->cursor = nir_before_instr(&intrin->instr); + + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + nir_address_format addr_format = addr_format_for_desc_type(desc_type, state); + + assert(intrin->dest.is_ssa); + nir_foreach_use(src, &intrin->dest.ssa) { + if (src->parent_instr->type != nir_instr_type_deref) + continue; + + nir_deref_instr *cast = nir_instr_as_deref(src->parent_instr); + assert(cast->deref_type == nir_deref_type_cast); + switch (desc_type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + cast->cast.align_mul = ANV_UBO_ALIGNMENT; + cast->cast.align_offset = 0; + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + cast->cast.align_mul = ANV_SSBO_ALIGNMENT; + cast->cast.align_offset = 0; + break; + + default: + break; + } + } + + assert(intrin->src[0].is_ssa); + nir_ssa_def *desc = + build_buffer_addr_for_res_index(b, desc_type, 
intrin->src[0].ssa, + addr_format, state); + + assert(intrin->dest.is_ssa); + assert(intrin->dest.ssa.bit_size == desc->bit_size); + assert(intrin->dest.ssa.num_components == desc->num_components); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc); + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_get_ssbo_size(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + if (_mesa_set_search(state->lowered_instrs, intrin)) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_address_format addr_format = + addr_format_for_desc_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, state); + + assert(intrin->src[0].is_ssa); + nir_ssa_def *desc = + build_buffer_addr_for_res_index(b, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + intrin->src[0].ssa, addr_format, state); + + switch (addr_format) { + case nir_address_format_64bit_global_32bit_offset: + case nir_address_format_64bit_bounded_global: { + nir_ssa_def *size = nir_channel(b, desc, 2); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, size); + nir_instr_remove(&intrin->instr); + break; + } + + case nir_address_format_32bit_index_offset: + /* The binding table index is the first component of the address. The + * back-end wants a scalar binding table index source. + */ + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], + nir_src_for_ssa(nir_channel(b, desc, 0))); + break; + + default: + unreachable("Unsupported address format"); + } + + return true; +} + +static bool +image_binding_needs_lowered_surface(nir_variable *var) +{ + return !(var->data.access & ACCESS_NON_READABLE) && + var->data.image.format != PIPE_FORMAT_NONE; +} + +static bool +lower_image_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + nir_variable *var = nir_deref_instr_get_variable(deref); + + unsigned set = var->data.descriptor_set; + unsigned binding = var->data.binding; + unsigned binding_offset = state->set[set].surface_offsets[binding]; + + b->cursor = nir_before_instr(&intrin->instr); + + ASSERTED const bool use_bindless = state->pdevice->has_bindless_images; + + if (intrin->intrinsic == nir_intrinsic_image_deref_load_param_intel) { + b->cursor = nir_instr_remove(&intrin->instr); + + assert(!use_bindless); /* Otherwise our offsets would be wrong */ + const unsigned param = nir_intrinsic_base(intrin); + + nir_ssa_def *desc = + build_load_var_deref_descriptor_mem(b, deref, param * 16, + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size, state); + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc); + } else if (binding_offset > MAX_BINDING_TABLE_SIZE) { + const unsigned desc_comp = + image_binding_needs_lowered_surface(var) ? 
1 : 0; + nir_ssa_def *desc = + build_load_var_deref_descriptor_mem(b, deref, 0, 2, 32, state); + nir_ssa_def *handle = nir_channel(b, desc, desc_comp); + nir_rewrite_image_intrinsic(intrin, handle, true); + } else { + unsigned array_size = + state->layout->set[set].layout->binding[binding].array_size; + + nir_ssa_def *index = NULL; + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + index = nir_ssa_for_src(b, deref->arr.index, 1); + if (state->add_bounds_checks) + index = nir_umin(b, index, nir_imm_int(b, array_size - 1)); + } else { + index = nir_imm_int(b, 0); + } + + index = nir_iadd_imm(b, index, binding_offset); + nir_rewrite_image_intrinsic(intrin, index, false); + } + + return true; +} + +static bool +lower_load_constant(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + b->cursor = nir_instr_remove(&intrin->instr); + + /* Any constant-offset load_constant instructions should have been removed + * by constant folding. + */ + assert(!nir_src_is_const(intrin->src[0])); + nir_ssa_def *offset = nir_iadd_imm(b, nir_ssa_for_src(b, intrin->src[0], 1), + nir_intrinsic_base(intrin)); + + nir_ssa_def *data; + if (!anv_use_relocations(state->pdevice)) { + unsigned load_size = intrin->dest.ssa.num_components * + intrin->dest.ssa.bit_size / 8; + unsigned load_align = intrin->dest.ssa.bit_size / 8; + + assert(load_size < b->shader->constant_data_size); + unsigned max_offset = b->shader->constant_data_size - load_size; + offset = nir_umin(b, offset, nir_imm_int(b, max_offset)); + + nir_ssa_def *const_data_base_addr = nir_pack_64_2x32_split(b, + nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW), + nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH)); + + data = nir_load_global_constant(b, nir_iadd(b, const_data_base_addr, + nir_u2u64(b, offset)), + load_align, + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size); + } else { + nir_ssa_def *index = nir_imm_int(b, state->constants_offset); + + data = nir_load_ubo(b, intrin->num_components, intrin->dest.ssa.bit_size, + index, offset, + .align_mul = intrin->dest.ssa.bit_size / 8, + .align_offset = 0, + .range_base = nir_intrinsic_base(intrin), + .range = nir_intrinsic_range(intrin)); + } + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, data); + + return true; +} + +static void +lower_tex_deref(nir_builder *b, nir_tex_instr *tex, + nir_tex_src_type deref_src_type, + unsigned *base_index, unsigned plane, + struct apply_pipeline_layout_state *state) +{ + int deref_src_idx = nir_tex_instr_src_index(tex, deref_src_type); + if (deref_src_idx < 0) + return; + + nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src); + nir_variable *var = nir_deref_instr_get_variable(deref); + + unsigned set = var->data.descriptor_set; + unsigned binding = var->data.binding; + unsigned array_size = + state->layout->set[set].layout->binding[binding].array_size; + + unsigned binding_offset; + if (deref_src_type == nir_tex_src_texture_deref) { + binding_offset = state->set[set].surface_offsets[binding]; + } else { + assert(deref_src_type == nir_tex_src_sampler_deref); + binding_offset = state->set[set].sampler_offsets[binding]; + } + + nir_tex_src_type offset_src_type; + nir_ssa_def *index = NULL; + if (binding_offset > MAX_BINDING_TABLE_SIZE) { + const unsigned plane_offset = + plane * sizeof(struct anv_sampled_image_descriptor); + + nir_ssa_def *desc = + build_load_var_deref_descriptor_mem(b, deref, plane_offset, + 2, 32, 
state); + + if (deref_src_type == nir_tex_src_texture_deref) { + offset_src_type = nir_tex_src_texture_handle; + index = nir_channel(b, desc, 0); + } else { + assert(deref_src_type == nir_tex_src_sampler_deref); + offset_src_type = nir_tex_src_sampler_handle; + index = nir_channel(b, desc, 1); + } + } else { + if (deref_src_type == nir_tex_src_texture_deref) { + offset_src_type = nir_tex_src_texture_offset; + } else { + assert(deref_src_type == nir_tex_src_sampler_deref); + offset_src_type = nir_tex_src_sampler_offset; + } + + *base_index = binding_offset + plane; + + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + + if (nir_src_is_const(deref->arr.index)) { + unsigned arr_index = MIN2(nir_src_as_uint(deref->arr.index), array_size - 1); + struct anv_sampler **immutable_samplers = + state->layout->set[set].layout->binding[binding].immutable_samplers; + if (immutable_samplers) { + /* Array of YCbCr samplers are tightly packed in the binding + * tables, compute the offset of an element in the array by + * adding the number of planes of all preceding elements. + */ + unsigned desc_arr_index = 0; + for (int i = 0; i < arr_index; i++) + desc_arr_index += immutable_samplers[i]->n_planes; + *base_index += desc_arr_index; + } else { + *base_index += arr_index; + } + } else { + /* From VK_KHR_sampler_ycbcr_conversion: + * + * If sampler Y’CBCR conversion is enabled, the combined image + * sampler must be indexed only by constant integral expressions + * when aggregated into arrays in shader code, irrespective of + * the shaderSampledImageArrayDynamicIndexing feature. + */ + assert(nir_tex_instr_src_index(tex, nir_tex_src_plane) == -1); + + index = nir_ssa_for_src(b, deref->arr.index, 1); + + if (state->add_bounds_checks) + index = nir_umin(b, index, nir_imm_int(b, array_size - 1)); + } + } + } + + if (index) { + nir_instr_rewrite_src(&tex->instr, &tex->src[deref_src_idx].src, + nir_src_for_ssa(index)); + tex->src[deref_src_idx].src_type = offset_src_type; + } else { + nir_tex_instr_remove_src(tex, deref_src_idx); + } +} + +static uint32_t +tex_instr_get_and_remove_plane_src(nir_tex_instr *tex) +{ + int plane_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_plane); + if (plane_src_idx < 0) + return 0; + + unsigned plane = nir_src_as_uint(tex->src[plane_src_idx].src); + + nir_tex_instr_remove_src(tex, plane_src_idx); + + return plane; +} + +static nir_ssa_def * +build_def_array_select(nir_builder *b, nir_ssa_def **srcs, nir_ssa_def *idx, + unsigned start, unsigned end) +{ + if (start == end - 1) { + return srcs[start]; + } else { + unsigned mid = start + (end - start) / 2; + return nir_bcsel(b, nir_ilt(b, idx, nir_imm_int(b, mid)), + build_def_array_select(b, srcs, idx, start, mid), + build_def_array_select(b, srcs, idx, mid, end)); + } +} + +static void +lower_gfx7_tex_swizzle(nir_builder *b, nir_tex_instr *tex, unsigned plane, + struct apply_pipeline_layout_state *state) +{ + assert(state->pdevice->info.verx10 == 70); + if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF || + nir_tex_instr_is_query(tex) || + tex->op == nir_texop_tg4 || /* We can't swizzle TG4 */ + (tex->is_shadow && tex->is_new_style_shadow)) + return; + + int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref); + assert(deref_src_idx >= 0); + + nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src); + nir_variable *var = nir_deref_instr_get_variable(deref); + + unsigned set = var->data.descriptor_set; + unsigned binding = var->data.binding; + const 
struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + if ((bind_layout->data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE) == 0) + return; + + b->cursor = nir_before_instr(&tex->instr); + + const unsigned plane_offset = + plane * sizeof(struct anv_texture_swizzle_descriptor); + nir_ssa_def *swiz = + build_load_var_deref_descriptor_mem(b, deref, plane_offset, + 1, 32, state); + + b->cursor = nir_after_instr(&tex->instr); + + assert(tex->dest.ssa.bit_size == 32); + assert(tex->dest.ssa.num_components == 4); + + /* Initializing to undef is ok; nir_opt_undef will clean it up. */ + nir_ssa_def *undef = nir_ssa_undef(b, 1, 32); + nir_ssa_def *comps[8]; + for (unsigned i = 0; i < ARRAY_SIZE(comps); i++) + comps[i] = undef; + + comps[ISL_CHANNEL_SELECT_ZERO] = nir_imm_int(b, 0); + if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float) + comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_float(b, 1); + else + comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_int(b, 1); + comps[ISL_CHANNEL_SELECT_RED] = nir_channel(b, &tex->dest.ssa, 0); + comps[ISL_CHANNEL_SELECT_GREEN] = nir_channel(b, &tex->dest.ssa, 1); + comps[ISL_CHANNEL_SELECT_BLUE] = nir_channel(b, &tex->dest.ssa, 2); + comps[ISL_CHANNEL_SELECT_ALPHA] = nir_channel(b, &tex->dest.ssa, 3); + + nir_ssa_def *swiz_comps[4]; + for (unsigned i = 0; i < 4; i++) { + nir_ssa_def *comp_swiz = nir_extract_u8(b, swiz, nir_imm_int(b, i)); + swiz_comps[i] = build_def_array_select(b, comps, comp_swiz, 0, 8); + } + nir_ssa_def *swiz_tex_res = nir_vec(b, swiz_comps, 4); + + /* Rewrite uses before we insert so we don't rewrite this use */ + nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, + swiz_tex_res, + swiz_tex_res->parent_instr); +} + +static bool +lower_tex(nir_builder *b, nir_tex_instr *tex, + struct apply_pipeline_layout_state *state) +{ + unsigned plane = tex_instr_get_and_remove_plane_src(tex); + + /* On Ivy Bridge and Bay Trail, we have to swizzle in the shader. Do this + * before we lower the derefs away so we can still find the descriptor. 
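+ * (Each byte of the swizzle dword that lower_gfx7_tex_swizzle() loads above
+ * is an ISL_CHANNEL_SELECT_* value; build_def_array_select() turns it into
+ * a bcsel tree that picks, per destination component, one of
+ * {zero, one, r, g, b, a} from the raw texture result.)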
+ */ + if (state->pdevice->info.verx10 == 70) + lower_gfx7_tex_swizzle(b, tex, plane, state); + + b->cursor = nir_before_instr(&tex->instr); + + lower_tex_deref(b, tex, nir_tex_src_texture_deref, + &tex->texture_index, plane, state); + + lower_tex_deref(b, tex, nir_tex_src_sampler_deref, + &tex->sampler_index, plane, state); + + return true; +} + +static bool +lower_ray_query_globals(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + b->cursor = nir_instr_remove(&intrin->instr); + + nir_ssa_def *rq_globals = + nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0), + .base = offsetof(struct anv_push_constants, ray_query_globals), + .range = sizeof_field(struct anv_push_constants, ray_query_globals)); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, rq_globals); + + return true; +} + +static bool +apply_pipeline_layout(nir_builder *b, nir_instr *instr, void *_state) +{ + struct apply_pipeline_layout_state *state = _state; + + switch (instr->type) { + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_vulkan_resource_index: + return lower_res_index_intrinsic(b, intrin, state); + case nir_intrinsic_vulkan_resource_reindex: + return lower_res_reindex_intrinsic(b, intrin, state); + case nir_intrinsic_load_vulkan_descriptor: + return lower_load_vulkan_descriptor(b, intrin, state); + case nir_intrinsic_get_ssbo_size: + return lower_get_ssbo_size(b, intrin, state); + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_fadd: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: + case nir_intrinsic_image_deref_load_param_intel: + case nir_intrinsic_image_deref_load_raw_intel: + case nir_intrinsic_image_deref_store_raw_intel: + return lower_image_intrinsic(b, intrin, state); + case nir_intrinsic_load_constant: + return lower_load_constant(b, intrin, state); + case nir_intrinsic_load_ray_query_global_intel: + return lower_ray_query_globals(b, intrin, state); + default: + return false; + } + break; + } + case nir_instr_type_tex: + return lower_tex(b, nir_instr_as_tex(instr), state); + default: + return false; + } +} + +struct binding_info { + uint32_t binding; + uint8_t set; + uint16_t score; +}; + +static int +compare_binding_infos(const void *_a, const void *_b) +{ + const struct binding_info *a = _a, *b = _b; + if (a->score != b->score) + return b->score - a->score; + + if (a->set != b->set) + return a->set - b->set; + + return a->binding - b->binding; +} + +void +anv_nir_apply_pipeline_layout(nir_shader *shader, + const struct anv_physical_device *pdevice, + bool robust_buffer_access, + const struct anv_pipeline_layout *layout, + struct anv_pipeline_bind_map *map) +{ + void *mem_ctx = ralloc_context(NULL); + + struct apply_pipeline_layout_state state = { + .pdevice = pdevice, + .layout = layout, + .add_bounds_checks = robust_buffer_access, + .desc_addr_format = + 
brw_shader_stage_requires_bindless_resources(shader->info.stage) ? + nir_address_format_64bit_global_32bit_offset : + nir_address_format_32bit_index_offset, + .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_buffer_access), + .ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_buffer_access), + .lowered_instrs = _mesa_pointer_set_create(mem_ctx), + }; + + for (unsigned s = 0; s < layout->num_sets; s++) { + const unsigned count = layout->set[s].layout->binding_count; + state.set[s].use_count = rzalloc_array(mem_ctx, uint8_t, count); + state.set[s].surface_offsets = rzalloc_array(mem_ctx, uint8_t, count); + state.set[s].sampler_offsets = rzalloc_array(mem_ctx, uint8_t, count); + } + + nir_shader_instructions_pass(shader, get_used_bindings, + nir_metadata_all, &state); + + for (unsigned s = 0; s < layout->num_sets; s++) { + if (state.desc_addr_format != nir_address_format_32bit_index_offset) { + state.set[s].desc_offset = BINDLESS_OFFSET; + } else if (state.set[s].desc_buffer_used) { + map->surface_to_descriptor[map->surface_count] = + (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_DESCRIPTORS, + .index = s, + }; + state.set[s].desc_offset = map->surface_count; + map->surface_count++; + } + } + + if (state.uses_constants && anv_use_relocations(pdevice)) { + state.constants_offset = map->surface_count; + map->surface_to_descriptor[map->surface_count].set = + ANV_DESCRIPTOR_SET_SHADER_CONSTANTS; + map->surface_count++; + } + + unsigned used_binding_count = 0; + for (uint32_t set = 0; set < layout->num_sets; set++) { + struct anv_descriptor_set_layout *set_layout = layout->set[set].layout; + for (unsigned b = 0; b < set_layout->binding_count; b++) { + if (state.set[set].use_count[b] == 0) + continue; + + used_binding_count++; + } + } + + struct binding_info *infos = + rzalloc_array(mem_ctx, struct binding_info, used_binding_count); + used_binding_count = 0; + for (uint32_t set = 0; set < layout->num_sets; set++) { + const struct anv_descriptor_set_layout *set_layout = layout->set[set].layout; + for (unsigned b = 0; b < set_layout->binding_count; b++) { + if (state.set[set].use_count[b] == 0) + continue; + + const struct anv_descriptor_set_binding_layout *binding = + &layout->set[set].layout->binding[b]; + + /* Do a fixed-point calculation to generate a score based on the + * number of uses and the binding array size. We shift by 7 instead + * of 8 because we're going to use the top bit below to make + * everything which does not support bindless super higher priority + * than things which do. + */ + uint16_t score = ((uint16_t)state.set[set].use_count[b] << 7) / + binding->array_size; + + /* If the descriptor type doesn't support bindless then put it at the + * beginning so we guarantee it gets a slot. + */ + if (!anv_descriptor_supports_bindless(pdevice, binding, true) || + !anv_descriptor_supports_bindless(pdevice, binding, false)) + score |= 1 << 15; + + infos[used_binding_count++] = (struct binding_info) { + .set = set, + .binding = b, + .score = score, + }; + } + } + + /* Order the binding infos based on score with highest scores first. If + * scores are equal we then order by set and binding. 
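+ *
+ * For instance (numbers purely illustrative): a binding used 3 times with
+ * array_size 1 scores (3 << 7) / 1 = 384, while one used 4 times with
+ * array_size 16 scores (4 << 7) / 16 = 32, so the former gets its binding
+ * table slots first; any binding that cannot go bindless also gets bit 15
+ * set and therefore always sorts ahead of bindless-capable ones.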
+ */ + qsort(infos, used_binding_count, sizeof(struct binding_info), + compare_binding_infos); + + for (unsigned i = 0; i < used_binding_count; i++) { + unsigned set = infos[i].set, b = infos[i].binding; + const struct anv_descriptor_set_binding_layout *binding = + &layout->set[set].layout->binding[b]; + + const uint32_t array_size = binding->array_size; + + if (binding->dynamic_offset_index >= 0) + state.has_dynamic_buffers = true; + + if (binding->data & ANV_DESCRIPTOR_SURFACE_STATE) { + if (map->surface_count + array_size > MAX_BINDING_TABLE_SIZE || + anv_descriptor_requires_bindless(pdevice, binding, false) || + brw_shader_stage_requires_bindless_resources(shader->info.stage)) { + /* If this descriptor doesn't fit in the binding table or if it + * requires bindless for some reason, flag it as bindless. + */ + assert(anv_descriptor_supports_bindless(pdevice, binding, false)); + state.set[set].surface_offsets[b] = BINDLESS_OFFSET; + } else { + state.set[set].surface_offsets[b] = map->surface_count; + if (binding->dynamic_offset_index < 0) { + struct anv_sampler **samplers = binding->immutable_samplers; + for (unsigned i = 0; i < binding->array_size; i++) { + uint8_t planes = samplers ? samplers[i]->n_planes : 1; + for (uint8_t p = 0; p < planes; p++) { + map->surface_to_descriptor[map->surface_count++] = + (struct anv_pipeline_binding) { + .set = set, + .index = binding->descriptor_index + i, + .plane = p, + }; + } + } + } else { + for (unsigned i = 0; i < binding->array_size; i++) { + map->surface_to_descriptor[map->surface_count++] = + (struct anv_pipeline_binding) { + .set = set, + .index = binding->descriptor_index + i, + .dynamic_offset_index = + layout->set[set].dynamic_offset_start + + binding->dynamic_offset_index + i, + }; + } + } + } + assert(map->surface_count <= MAX_BINDING_TABLE_SIZE); + } + + if (binding->data & ANV_DESCRIPTOR_SAMPLER_STATE) { + if (map->sampler_count + array_size > MAX_SAMPLER_TABLE_SIZE || + anv_descriptor_requires_bindless(pdevice, binding, true) || + brw_shader_stage_requires_bindless_resources(shader->info.stage)) { + /* If this descriptor doesn't fit in the binding table or if it + * requires bindless for some reason, flag it as bindless. + * + * We also make large sampler arrays bindless because we can avoid + * using indirect sends thanks to bindless samplers being packed + * less tightly than the sampler table. + */ + assert(anv_descriptor_supports_bindless(pdevice, binding, true)); + state.set[set].sampler_offsets[b] = BINDLESS_OFFSET; + } else { + state.set[set].sampler_offsets[b] = map->sampler_count; + struct anv_sampler **samplers = binding->immutable_samplers; + for (unsigned i = 0; i < binding->array_size; i++) { + uint8_t planes = samplers ? 
samplers[i]->n_planes : 1; + for (uint8_t p = 0; p < planes; p++) { + map->sampler_to_descriptor[map->sampler_count++] = + (struct anv_pipeline_binding) { + .set = set, + .index = binding->descriptor_index + i, + .plane = p, + }; + } + } + } + } + } + + nir_foreach_image_variable(var, shader) { + const uint32_t set = var->data.descriptor_set; + const uint32_t binding = var->data.binding; + const struct anv_descriptor_set_binding_layout *bind_layout = + &layout->set[set].layout->binding[binding]; + const uint32_t array_size = bind_layout->array_size; + + if (state.set[set].use_count[binding] == 0) + continue; + + if (state.set[set].surface_offsets[binding] >= MAX_BINDING_TABLE_SIZE) + continue; + + struct anv_pipeline_binding *pipe_binding = + &map->surface_to_descriptor[state.set[set].surface_offsets[binding]]; + for (unsigned i = 0; i < array_size; i++) { + assert(pipe_binding[i].set == set); + assert(pipe_binding[i].index == bind_layout->descriptor_index + i); + + pipe_binding[i].lowered_storage_surface = + image_binding_needs_lowered_surface(var); + } + } + + /* Before we do the normal lowering, we look for any SSBO operations + * that we can lower to the BTI model and lower them up-front. The BTI + * model can perform better than the A64 model for a couple reasons: + * + * 1. 48-bit address calculations are potentially expensive and using + * the BTI model lets us simply compute 32-bit offsets and the + * hardware adds the 64-bit surface base address. + * + * 2. The BTI messages, because they use surface states, do bounds + * checking for us. With the A64 model, we have to do our own + * bounds checking and this means wider pointers and extra + * calculations and branching in the shader. + * + * The solution to both of these is to convert things to the BTI model + * opportunistically. The reason why we need to do this as a pre-pass + * is for two reasons: + * + * 1. The BTI model requires nir_address_format_32bit_index_offset + * pointers which are not the same type as the pointers needed for + * the A64 model. Because all our derefs are set up for the A64 + * model (in case we have variable pointers), we have to crawl all + * the way back to the vulkan_resource_index intrinsic and build a + * completely fresh index+offset calculation. + * + * 2. Because the variable-pointers-capable lowering that we do as part + * of apply_pipeline_layout_block is destructive (It really has to + * be to handle variable pointers properly), we've lost the deref + * information by the time we get to the load/store/atomic + * intrinsics in that pass. + */ + nir_shader_instructions_pass(shader, lower_direct_buffer_instr, + nir_metadata_block_index | + nir_metadata_dominance, + &state); + + /* We just got rid of all the direct access. Delete it so it's not in the + * way when we do our indirect lowering. + */ + nir_opt_dce(shader); + + nir_shader_instructions_pass(shader, apply_pipeline_layout, + nir_metadata_block_index | + nir_metadata_dominance, + &state); + + ralloc_free(mem_ctx); + + if (brw_shader_stage_is_bindless(shader->info.stage)) { + assert(map->surface_count == 0); + assert(map->sampler_count == 0); + } + + /* Now that we're done computing the surface and sampler portions of the + * bind map, hash them. This lets us quickly determine if the actual + * mapping has changed and not just a no-op pipeline change. 
+ */ + _mesa_sha1_compute(map->surface_to_descriptor, + map->surface_count * sizeof(struct anv_pipeline_binding), + map->surface_sha1); + _mesa_sha1_compute(map->sampler_to_descriptor, + map->sampler_count * sizeof(struct anv_pipeline_binding), + map->sampler_sha1); +} diff --git a/src/intel/vulkan_hasvk/anv_nir_compute_push_layout.c b/src/intel/vulkan_hasvk/anv_nir_compute_push_layout.c new file mode 100644 index 00000000000..2385c5aea20 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir_compute_push_layout.c @@ -0,0 +1,290 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_nir.h" +#include "nir_builder.h" +#include "compiler/brw_nir.h" +#include "util/mesa-sha1.h" + +#define sizeof_field(type, field) sizeof(((type *)0)->field) + +void +anv_nir_compute_push_layout(nir_shader *nir, + const struct anv_physical_device *pdevice, + bool robust_buffer_access, + struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map, + void *mem_ctx) +{ + const struct brw_compiler *compiler = pdevice->compiler; + const struct intel_device_info *devinfo = compiler->devinfo; + memset(map->push_ranges, 0, sizeof(map->push_ranges)); + + bool has_const_ubo = false; + unsigned push_start = UINT_MAX, push_end = 0; + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_ubo: + if (nir_src_is_const(intrin->src[0]) && + nir_src_is_const(intrin->src[1])) + has_const_ubo = true; + break; + + case nir_intrinsic_load_push_constant: { + unsigned base = nir_intrinsic_base(intrin); + unsigned range = nir_intrinsic_range(intrin); + push_start = MIN2(push_start, base); + push_end = MAX2(push_end, base + range); + break; + } + + case nir_intrinsic_load_desc_set_address_intel: + push_start = MIN2(push_start, + offsetof(struct anv_push_constants, desc_sets)); + push_end = MAX2(push_end, push_start + + sizeof_field(struct anv_push_constants, desc_sets)); + break; + + default: + break; + } + } + } + } + + const bool has_push_intrinsic = push_start <= push_end; + + const bool push_ubo_ranges = + pdevice->info.verx10 >= 75 && + has_const_ubo && nir->info.stage != MESA_SHADER_COMPUTE && + 
!brw_shader_stage_requires_bindless_resources(nir->info.stage); + + if (push_ubo_ranges && robust_buffer_access) { + /* We can't on-the-fly adjust our push ranges because doing so would + * mess up the layout in the shader. When robustBufferAccess is + * enabled, we push a mask into the shader indicating which pushed + * registers are valid and we zero out the invalid ones at the top of + * the shader. + */ + const uint32_t push_reg_mask_start = + offsetof(struct anv_push_constants, push_reg_mask[nir->info.stage]); + const uint32_t push_reg_mask_end = push_reg_mask_start + sizeof(uint64_t); + push_start = MIN2(push_start, push_reg_mask_start); + push_end = MAX2(push_end, push_reg_mask_end); + } + + if (nir->info.stage == MESA_SHADER_COMPUTE && devinfo->verx10 < 125) { + /* For compute shaders, we always have to have the subgroup ID. The + * back-end compiler will "helpfully" add it for us in the last push + * constant slot. Yes, there is an off-by-one error here but that's + * because the back-end will add it so we want to claim the number of + * push constants one dword less than the full amount including + * gl_SubgroupId. + */ + assert(push_end <= offsetof(struct anv_push_constants, cs.subgroup_id)); + push_end = offsetof(struct anv_push_constants, cs.subgroup_id); + } + + /* Align push_start down to a 32B boundary and make it no larger than + * push_end (no push constants is indicated by push_start = UINT_MAX). + */ + push_start = MIN2(push_start, push_end); + push_start = align_down_u32(push_start, 32); + + /* For vec4 our push data size needs to be aligned to a vec4 and for + * scalar, it needs to be aligned to a DWORD. + */ + const unsigned align = compiler->scalar_stage[nir->info.stage] ? 4 : 16; + nir->num_uniforms = ALIGN(push_end - push_start, align); + prog_data->nr_params = nir->num_uniforms / 4; + prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params); + + struct anv_push_range push_constant_range = { + .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, + .start = push_start / 32, + .length = DIV_ROUND_UP(push_end - push_start, 32), + }; + + if (has_push_intrinsic) { + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + + nir_builder build, *b = &build; + nir_builder_init(b, function->impl); + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_push_constant: { + /* With bindless shaders we load uniforms with SEND + * messages. All the push constants are located after the + * RT_DISPATCH_GLOBALS. We just need to add the offset to + * the address right after RT_DISPATCH_GLOBALS (see + * brw_nir_lower_rt_intrinsics.c). + */ + unsigned base_offset = + brw_shader_stage_requires_bindless_resources(nir->info.stage) ? 
0 : push_start; + intrin->intrinsic = nir_intrinsic_load_uniform; + nir_intrinsic_set_base(intrin, + nir_intrinsic_base(intrin) - + base_offset); + break; + } + + case nir_intrinsic_load_desc_set_address_intel: { + b->cursor = nir_before_instr(&intrin->instr); + nir_ssa_def *pc_load = nir_load_uniform(b, 1, 64, + nir_imul_imm(b, intrin->src[0].ssa, sizeof(uint64_t)), + .base = offsetof(struct anv_push_constants, desc_sets), + .range = sizeof_field(struct anv_push_constants, desc_sets), + .dest_type = nir_type_uint64); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, pc_load); + break; + } + + default: + break; + } + } + } + } + } + + if (push_ubo_ranges) { + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + + /* The vec4 back-end pushes at most 32 regs while the scalar back-end + * pushes up to 64. This is primarily because the scalar back-end has a + * massively more competent register allocator and so the risk of + * spilling due to UBO pushing isn't nearly as high. + */ + const unsigned max_push_regs = + compiler->scalar_stage[nir->info.stage] ? 64 : 32; + + unsigned total_push_regs = push_constant_range.length; + for (unsigned i = 0; i < 4; i++) { + if (total_push_regs + prog_data->ubo_ranges[i].length > max_push_regs) + prog_data->ubo_ranges[i].length = max_push_regs - total_push_regs; + total_push_regs += prog_data->ubo_ranges[i].length; + } + assert(total_push_regs <= max_push_regs); + + int n = 0; + + if (push_constant_range.length > 0) + map->push_ranges[n++] = push_constant_range; + + if (robust_buffer_access) { + const uint32_t push_reg_mask_offset = + offsetof(struct anv_push_constants, push_reg_mask[nir->info.stage]); + assert(push_reg_mask_offset >= push_start); + prog_data->push_reg_mask_param = + (push_reg_mask_offset - push_start) / 4; + } + + unsigned range_start_reg = push_constant_range.length; + + for (int i = 0; i < 4; i++) { + struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i]; + if (ubo_range->length == 0) + continue; + + if (n >= 4 || (n == 3 && compiler->constant_buffer_0_is_relative)) { + memset(ubo_range, 0, sizeof(*ubo_range)); + continue; + } + + const struct anv_pipeline_binding *binding = + &map->surface_to_descriptor[ubo_range->block]; + + map->push_ranges[n++] = (struct anv_push_range) { + .set = binding->set, + .index = binding->index, + .dynamic_offset_index = binding->dynamic_offset_index, + .start = ubo_range->start, + .length = ubo_range->length, + }; + + /* We only bother to shader-zero pushed client UBOs */ + if (binding->set < MAX_SETS && robust_buffer_access) { + prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg, + ubo_range->length); + } + + range_start_reg += ubo_range->length; + } + } else { + /* For Ivy Bridge, the push constants packets have a different + * rule that would require us to iterate in the other direction + * and possibly mess around with dynamic state base address. + * Don't bother; just emit regular push constants at n = 0. + * + * In the compute case, we don't have multiple push ranges so it's + * better to just provide one in push_ranges[0]. + */ + map->push_ranges[0] = push_constant_range; + } + + /* Now that we're done computing the push constant portion of the + * bind map, hash it. This lets us quickly determine if the actual + * mapping has changed and not just a no-op pipeline change. 
+ */ + _mesa_sha1_compute(map->push_ranges, + sizeof(map->push_ranges), + map->push_sha1); +} + +void +anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map) +{ +#ifndef NDEBUG + unsigned prog_data_push_size = DIV_ROUND_UP(prog_data->nr_params, 8); + for (unsigned i = 0; i < 4; i++) + prog_data_push_size += prog_data->ubo_ranges[i].length; + + unsigned bind_map_push_size = 0; + for (unsigned i = 0; i < 4; i++) + bind_map_push_size += map->push_ranges[i].length; + + /* We could go through everything again but it should be enough to assert + * that they push the same number of registers. This should alert us if + * the back-end compiler decides to re-arrange stuff or shrink a range. + */ + assert(prog_data_push_size == bind_map_push_size); +#endif +} diff --git a/src/intel/vulkan_hasvk/anv_nir_lower_multiview.c b/src/intel/vulkan_hasvk/anv_nir_lower_multiview.c new file mode 100644 index 00000000000..dd591976ac4 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir_lower_multiview.c @@ -0,0 +1,324 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_nir.h" +#include "nir/nir_builder.h" +#include "util/debug.h" + +/** + * This file implements the lowering required for VK_KHR_multiview. + * + * When possible, Primitive Replication is used and the shader is modified to + * make gl_Position an array and fill it with values for each view. + * + * Otherwise we implement multiview using instanced rendering. The number of + * instances in each draw call is multiplied by the number of views in the + * subpass. Then, in the shader, we divide gl_InstanceId by the number of + * views and use gl_InstanceId % view_count to compute the actual ViewIndex. + */ + +struct lower_multiview_state { + nir_builder builder; + + uint32_t view_mask; + + nir_ssa_def *instance_id; + nir_ssa_def *view_index; +}; + +static nir_ssa_def * +build_instance_id(struct lower_multiview_state *state) +{ + assert(state->builder.shader->info.stage == MESA_SHADER_VERTEX); + + if (state->instance_id == NULL) { + nir_builder *b = &state->builder; + + b->cursor = nir_before_block(nir_start_block(b->impl)); + + /* We use instancing for implementing multiview. The actual instance id + * is given by dividing instance_id by the number of views in this + * subpass. 
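+ *
+ * Worked example (illustrative): with two views enabled the draw is
+ * emitted with twice the instance count, so hardware instance ids
+ * 0,1,2,3,... become application instance ids 0,0,1,1,... here, while
+ * build_view_index() below recovers the view from instance_id %
+ * view_count (remapping through view_mask when the mask is sparse).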
+ */ + state->instance_id = + nir_idiv(b, nir_load_instance_id(b), + nir_imm_int(b, util_bitcount(state->view_mask))); + } + + return state->instance_id; +} + +static nir_ssa_def * +build_view_index(struct lower_multiview_state *state) +{ + assert(state->builder.shader->info.stage != MESA_SHADER_FRAGMENT); + + if (state->view_index == NULL) { + nir_builder *b = &state->builder; + + b->cursor = nir_before_block(nir_start_block(b->impl)); + + assert(state->view_mask != 0); + if (util_bitcount(state->view_mask) == 1) { + /* Set the view index directly. */ + state->view_index = nir_imm_int(b, ffs(state->view_mask) - 1); + } else if (state->builder.shader->info.stage == MESA_SHADER_VERTEX) { + /* We only support 16 viewports */ + assert((state->view_mask & 0xffff0000) == 0); + + /* We use instancing for implementing multiview. The compacted view + * id is given by instance_id % view_count. We then have to convert + * that to an actual view id. + */ + nir_ssa_def *compacted = + nir_umod(b, nir_load_instance_id(b), + nir_imm_int(b, util_bitcount(state->view_mask))); + + if (util_is_power_of_two_or_zero(state->view_mask + 1)) { + /* If we have a full view mask, then compacted is what we want */ + state->view_index = compacted; + } else { + /* Now we define a map from compacted view index to the actual + * view index that's based on the view_mask. The map is given by + * 16 nibbles, each of which is a value from 0 to 15. + */ + uint64_t remap = 0; + uint32_t i = 0; + u_foreach_bit(bit, state->view_mask) { + assert(bit < 16); + remap |= (uint64_t)bit << (i++ * 4); + } + + nir_ssa_def *shift = nir_imul(b, compacted, nir_imm_int(b, 4)); + + /* One of these days, when we have int64 everywhere, this will be + * easier. + */ + nir_ssa_def *shifted; + if (remap <= UINT32_MAX) { + shifted = nir_ushr(b, nir_imm_int(b, remap), shift); + } else { + nir_ssa_def *shifted_low = + nir_ushr(b, nir_imm_int(b, remap), shift); + nir_ssa_def *shifted_high = + nir_ushr(b, nir_imm_int(b, remap >> 32), + nir_isub(b, shift, nir_imm_int(b, 32))); + shifted = nir_bcsel(b, nir_ilt(b, shift, nir_imm_int(b, 32)), + shifted_low, shifted_high); + } + state->view_index = nir_iand(b, shifted, nir_imm_int(b, 0xf)); + } + } else { + const struct glsl_type *type = glsl_int_type(); + if (b->shader->info.stage == MESA_SHADER_TESS_CTRL || + b->shader->info.stage == MESA_SHADER_GEOMETRY) + type = glsl_array_type(type, 1, 0); + + nir_variable *idx_var = + nir_variable_create(b->shader, nir_var_shader_in, + type, "view index"); + idx_var->data.location = VARYING_SLOT_VIEW_INDEX; + if (b->shader->info.stage == MESA_SHADER_FRAGMENT) + idx_var->data.interpolation = INTERP_MODE_FLAT; + + nir_deref_instr *deref = nir_build_deref_var(b, idx_var); + if (glsl_type_is_array(type)) + deref = nir_build_deref_array_imm(b, deref, 0); + + state->view_index = nir_load_deref(b, deref); + } + } + + return state->view_index; +} + +static bool +is_load_view_index(const nir_instr *instr, const void *data) +{ + return instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_view_index; +} + +static nir_ssa_def * +replace_load_view_index_with_zero(struct nir_builder *b, + nir_instr *instr, void *data) +{ + assert(is_load_view_index(instr, data)); + return nir_imm_zero(b, 1, 32); +} + +static nir_ssa_def * +replace_load_view_index_with_layer_id(struct nir_builder *b, + nir_instr *instr, void *data) +{ + assert(is_load_view_index(instr, data)); + return nir_load_layer_id(b); +} + +bool 
+anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask, + bool use_primitive_replication) +{ + assert(shader->info.stage != MESA_SHADER_COMPUTE); + + /* If multiview isn't enabled, just lower the ViewIndex builtin to zero. */ + if (view_mask == 0) { + return nir_shader_lower_instructions(shader, is_load_view_index, + replace_load_view_index_with_zero, NULL); + } + + if (shader->info.stage == MESA_SHADER_FRAGMENT) { + return nir_shader_lower_instructions(shader, is_load_view_index, + replace_load_view_index_with_layer_id, NULL); + } + + /* This pass assumes a single entrypoint */ + nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader); + + /* Primitive Replication allows a shader to write different positions for + * each view in the same execution. If only the position depends on the + * view, then it is possible to use the feature instead of instancing to + * implement multiview. + */ + if (use_primitive_replication) { + bool progress = nir_lower_multiview(shader, view_mask); + + if (progress) { + nir_builder b; + nir_builder_init(&b, entrypoint); + b.cursor = nir_before_cf_list(&entrypoint->body); + + /* Fill Layer ID with zero. Replication will use that as base to + * apply the RTAI offsets. + */ + nir_variable *layer_id_out = + nir_variable_create(shader, nir_var_shader_out, + glsl_int_type(), "layer ID"); + layer_id_out->data.location = VARYING_SLOT_LAYER; + nir_store_var(&b, layer_id_out, nir_imm_zero(&b, 1, 32), 0x1); + } + + return progress; + } + + struct lower_multiview_state state = { + .view_mask = view_mask, + }; + + nir_builder_init(&state.builder, entrypoint); + + nir_foreach_block(block, entrypoint) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr); + + if (load->intrinsic != nir_intrinsic_load_instance_id && + load->intrinsic != nir_intrinsic_load_view_index) + continue; + + assert(load->dest.is_ssa); + + nir_ssa_def *value; + if (load->intrinsic == nir_intrinsic_load_instance_id) { + value = build_instance_id(&state); + } else { + assert(load->intrinsic == nir_intrinsic_load_view_index); + value = build_view_index(&state); + } + + nir_ssa_def_rewrite_uses(&load->dest.ssa, value); + + nir_instr_remove(&load->instr); + } + } + + /* The view index is available in all stages but the instance id is only + * available in the VS. If it's not a fragment shader, we need to pass + * the view index on to the next stage. + */ + nir_ssa_def *view_index = build_view_index(&state); + + nir_builder *b = &state.builder; + + assert(view_index->parent_instr->block == nir_start_block(entrypoint)); + b->cursor = nir_after_instr(view_index->parent_instr); + + /* Unless there is only one possible view index (that would be set + * directly), pass it to the next stage. 
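+    * In the single-view case build_view_index() produced an immediate
+    * constant above, so a later stage can recompute the same value from
+    * the view mask instead of reading a varying.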
*/ + if (util_bitcount(state.view_mask) != 1) { + nir_variable *view_index_out = + nir_variable_create(shader, nir_var_shader_out, + glsl_int_type(), "view index"); + view_index_out->data.location = VARYING_SLOT_VIEW_INDEX; + nir_store_var(b, view_index_out, view_index, 0x1); + } + + nir_variable *layer_id_out = + nir_variable_create(shader, nir_var_shader_out, + glsl_int_type(), "layer ID"); + layer_id_out->data.location = VARYING_SLOT_LAYER; + nir_store_var(b, layer_id_out, view_index, 0x1); + + nir_metadata_preserve(entrypoint, nir_metadata_block_index | + nir_metadata_dominance); + + return true; +} + +bool +anv_check_for_primitive_replication(struct anv_device *device, + VkShaderStageFlags stages, + nir_shader **shaders, + uint32_t view_mask) +{ + assert(device->info->ver >= 12); + + static int primitive_replication_max_views = -1; + if (primitive_replication_max_views < 0) { + /* TODO: Figure out why we are not getting same benefits for larger than + * 2 views. For now use Primitive Replication just for the 2-view case + * by default. + */ + const unsigned default_max_views = 2; + + primitive_replication_max_views = + MIN2(MAX_VIEWS_FOR_PRIMITIVE_REPLICATION, + env_var_as_unsigned("ANV_PRIMITIVE_REPLICATION_MAX_VIEWS", + default_max_views)); + } + + /* TODO: We should be able to support replication at 'geometry' stages + * later than Vertex. In that case only the last stage can refer to + * gl_ViewIndex. + */ + if (stages & ~(VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT)) + return false; + + int view_count = util_bitcount(view_mask); + if (view_count == 1 || view_count > primitive_replication_max_views) + return false; + + return nir_can_lower_multiview(shaders[MESA_SHADER_VERTEX]); +} diff --git a/src/intel/vulkan_hasvk/anv_nir_lower_ubo_loads.c b/src/intel/vulkan_hasvk/anv_nir_lower_ubo_loads.c new file mode 100644 index 00000000000..5a170352c80 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir_lower_ubo_loads.c @@ -0,0 +1,124 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "anv_nir.h" +#include "nir_builder.h" + +static bool +lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr); + if (load->intrinsic != nir_intrinsic_load_global_constant_offset && + load->intrinsic != nir_intrinsic_load_global_constant_bounded) + return false; + + b->cursor = nir_before_instr(instr); + + nir_ssa_def *base_addr = load->src[0].ssa; + nir_ssa_def *bound = NULL; + if (load->intrinsic == nir_intrinsic_load_global_constant_bounded) + bound = load->src[2].ssa; + + unsigned bit_size = load->dest.ssa.bit_size; + assert(bit_size >= 8 && bit_size % 8 == 0); + unsigned byte_size = bit_size / 8; + + nir_ssa_def *val; + if (nir_src_is_const(load->src[1])) { + uint32_t offset = nir_src_as_uint(load->src[1]); + + /* Things should be component-aligned. */ + assert(offset % byte_size == 0); + + assert(ANV_UBO_ALIGNMENT == 64); + + unsigned suboffset = offset % 64; + uint64_t aligned_offset = offset - suboffset; + + /* Load two just in case we go over a 64B boundary */ + nir_ssa_def *data[2]; + for (unsigned i = 0; i < 2; i++) { + nir_ssa_def *pred; + if (bound) { + pred = nir_ilt(b, nir_imm_int(b, aligned_offset + i * 64 + 63), + bound); + } else { + pred = nir_imm_true(b); + } + + nir_ssa_def *addr = nir_iadd_imm(b, base_addr, + aligned_offset + i * 64); + + data[i] = nir_load_global_const_block_intel(b, 16, addr, pred); + } + + val = nir_extract_bits(b, data, 2, suboffset * 8, + load->num_components, bit_size); + } else { + nir_ssa_def *offset = load->src[1].ssa; + nir_ssa_def *addr = nir_iadd(b, base_addr, nir_u2u64(b, offset)); + + if (bound) { + nir_ssa_def *zero = nir_imm_zero(b, load->num_components, bit_size); + + unsigned load_size = byte_size * load->num_components; + nir_ssa_def *in_bounds = + nir_ilt(b, nir_iadd_imm(b, offset, load_size - 1), bound); + + nir_push_if(b, in_bounds); + + nir_ssa_def *load_val = + nir_build_load_global_constant(b, load->dest.ssa.num_components, + load->dest.ssa.bit_size, addr, + .access = nir_intrinsic_access(load), + .align_mul = nir_intrinsic_align_mul(load), + .align_offset = nir_intrinsic_align_offset(load)); + + nir_pop_if(b, NULL); + + val = nir_if_phi(b, load_val, zero); + } else { + val = nir_build_load_global_constant(b, load->dest.ssa.num_components, + load->dest.ssa.bit_size, addr, + .access = nir_intrinsic_access(load), + .align_mul = nir_intrinsic_align_mul(load), + .align_offset = nir_intrinsic_align_offset(load)); + } + } + + nir_ssa_def_rewrite_uses(&load->dest.ssa, val); + nir_instr_remove(&load->instr); + + return true; +} + +bool +anv_nir_lower_ubo_loads(nir_shader *shader) +{ + return nir_shader_instructions_pass(shader, lower_ubo_load_instr, + nir_metadata_none, + NULL); +} diff --git a/src/intel/vulkan_hasvk/anv_nir_lower_ycbcr_textures.c b/src/intel/vulkan_hasvk/anv_nir_lower_ycbcr_textures.c new file mode 100644 index 00000000000..e82cd032e20 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir_lower_ycbcr_textures.c @@ -0,0 +1,349 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * 
Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_nir.h" +#include "anv_private.h" +#include "nir/nir.h" +#include "nir/nir_builder.h" +#include "nir/nir_vulkan.h" + +struct ycbcr_state { + nir_builder *builder; + nir_ssa_def *image_size; + nir_tex_instr *origin_tex; + nir_deref_instr *tex_deref; + struct anv_ycbcr_conversion *conversion; +}; + +/* TODO: we should probably replace this with a push constant/uniform. */ +static nir_ssa_def * +get_texture_size(struct ycbcr_state *state, nir_deref_instr *texture) +{ + if (state->image_size) + return state->image_size; + + nir_builder *b = state->builder; + const struct glsl_type *type = texture->type; + nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1); + + tex->op = nir_texop_txs; + tex->sampler_dim = glsl_get_sampler_dim(type); + tex->is_array = glsl_sampler_type_is_array(type); + tex->is_shadow = glsl_sampler_type_is_shadow(type); + tex->dest_type = nir_type_int32; + + tex->src[0].src_type = nir_tex_src_texture_deref; + tex->src[0].src = nir_src_for_ssa(&texture->dest.ssa); + + nir_ssa_dest_init(&tex->instr, &tex->dest, + nir_tex_instr_dest_size(tex), 32, NULL); + nir_builder_instr_insert(b, &tex->instr); + + state->image_size = nir_i2f32(b, &tex->dest.ssa); + + return state->image_size; +} + +static nir_ssa_def * +implicit_downsampled_coord(nir_builder *b, + nir_ssa_def *value, + nir_ssa_def *max_value, + int div_scale) +{ + return nir_fadd(b, + value, + nir_fdiv(b, + nir_imm_float(b, 1.0f), + nir_fmul(b, + nir_imm_float(b, div_scale), + max_value))); +} + +static nir_ssa_def * +implicit_downsampled_coords(struct ycbcr_state *state, + nir_ssa_def *old_coords, + const struct anv_format_plane *plane_format) +{ + nir_builder *b = state->builder; + struct anv_ycbcr_conversion *conversion = state->conversion; + nir_ssa_def *image_size = get_texture_size(state, state->tex_deref); + nir_ssa_def *comp[4] = { NULL, }; + int c; + + for (c = 0; c < ARRAY_SIZE(conversion->chroma_offsets); c++) { + if (plane_format->denominator_scales[c] > 1 && + conversion->chroma_offsets[c] == VK_CHROMA_LOCATION_COSITED_EVEN) { + comp[c] = implicit_downsampled_coord(b, + nir_channel(b, old_coords, c), + nir_channel(b, image_size, c), + plane_format->denominator_scales[c]); + } else { + comp[c] = nir_channel(b, old_coords, c); + } + } + + /* Leave other coordinates untouched */ + for (; c < old_coords->num_components; c++) + comp[c] = nir_channel(b, old_coords, c); + + return nir_vec(b, comp, old_coords->num_components); +} + +static nir_ssa_def * +create_plane_tex_instr_implicit(struct ycbcr_state *state, + uint32_t plane) +{ + nir_builder *b = state->builder; + struct anv_ycbcr_conversion *conversion = state->conversion; + const struct anv_format_plane *plane_format = + &conversion->format->planes[plane]; + nir_tex_instr *old_tex = state->origin_tex; + nir_tex_instr 
*tex = nir_tex_instr_create(b->shader, old_tex->num_srcs + 1); + + for (uint32_t i = 0; i < old_tex->num_srcs; i++) { + tex->src[i].src_type = old_tex->src[i].src_type; + + switch (old_tex->src[i].src_type) { + case nir_tex_src_coord: + if (plane_format->has_chroma && conversion->chroma_reconstruction) { + assert(old_tex->src[i].src.is_ssa); + tex->src[i].src = + nir_src_for_ssa(implicit_downsampled_coords(state, + old_tex->src[i].src.ssa, + plane_format)); + break; + } + FALLTHROUGH; + default: + nir_src_copy(&tex->src[i].src, &old_tex->src[i].src, &tex->instr); + break; + } + } + tex->src[tex->num_srcs - 1].src = nir_src_for_ssa(nir_imm_int(b, plane)); + tex->src[tex->num_srcs - 1].src_type = nir_tex_src_plane; + + tex->sampler_dim = old_tex->sampler_dim; + tex->dest_type = old_tex->dest_type; + + tex->op = old_tex->op; + tex->coord_components = old_tex->coord_components; + tex->is_new_style_shadow = old_tex->is_new_style_shadow; + tex->component = old_tex->component; + + tex->texture_index = old_tex->texture_index; + tex->sampler_index = old_tex->sampler_index; + tex->is_array = old_tex->is_array; + + nir_ssa_dest_init(&tex->instr, &tex->dest, + old_tex->dest.ssa.num_components, + nir_dest_bit_size(old_tex->dest), NULL); + nir_builder_instr_insert(b, &tex->instr); + + return &tex->dest.ssa; +} + +static unsigned +channel_to_component(enum isl_channel_select channel) +{ + switch (channel) { + case ISL_CHANNEL_SELECT_RED: + return 0; + case ISL_CHANNEL_SELECT_GREEN: + return 1; + case ISL_CHANNEL_SELECT_BLUE: + return 2; + case ISL_CHANNEL_SELECT_ALPHA: + return 3; + default: + unreachable("invalid channel"); + return 0; + } +} + +static enum isl_channel_select +swizzle_channel(struct isl_swizzle swizzle, unsigned channel) +{ + switch (channel) { + case 0: + return swizzle.r; + case 1: + return swizzle.g; + case 2: + return swizzle.b; + case 3: + return swizzle.a; + default: + unreachable("invalid channel"); + return 0; + } +} + +static bool +anv_nir_lower_ycbcr_textures_instr(nir_builder *builder, + nir_instr *instr, + void *cb_data) +{ + const struct anv_pipeline_layout *layout = cb_data; + + if (instr->type != nir_instr_type_tex) + return false; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + + int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref); + assert(deref_src_idx >= 0); + nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src); + + nir_variable *var = nir_deref_instr_get_variable(deref); + const struct anv_descriptor_set_layout *set_layout = + layout->set[var->data.descriptor_set].layout; + const struct anv_descriptor_set_binding_layout *binding = + &set_layout->binding[var->data.binding]; + + /* For the following instructions, we don't apply any change and let the + * instruction apply to the first plane. 
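+    * (txs, query_levels and lod only return size/LOD information, not
+    * texel data, so letting them sample a single plane is enough and no
+    * YCbCr conversion or swizzling is required for them.)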
+ */ + if (tex->op == nir_texop_txs || + tex->op == nir_texop_query_levels || + tex->op == nir_texop_lod) + return false; + + if (binding->immutable_samplers == NULL) + return false; + + assert(tex->texture_index == 0); + unsigned array_index = 0; + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + if (!nir_src_is_const(deref->arr.index)) + return false; + array_index = nir_src_as_uint(deref->arr.index); + array_index = MIN2(array_index, binding->array_size - 1); + } + const struct anv_sampler *sampler = binding->immutable_samplers[array_index]; + + if (sampler->conversion == NULL) + return false; + + struct ycbcr_state state = { + .builder = builder, + .origin_tex = tex, + .tex_deref = deref, + .conversion = sampler->conversion, + }; + + builder->cursor = nir_before_instr(&tex->instr); + + const struct anv_format *format = state.conversion->format; + const struct isl_format_layout *y_isl_layout = NULL; + for (uint32_t p = 0; p < format->n_planes; p++) { + if (!format->planes[p].has_chroma) + y_isl_layout = isl_format_get_layout(format->planes[p].isl_format); + } + assert(y_isl_layout != NULL); + uint8_t y_bpc = y_isl_layout->channels_array[0].bits; + + /* |ycbcr_comp| holds components in the order : Cr-Y-Cb */ + nir_ssa_def *zero = nir_imm_float(builder, 0.0f); + nir_ssa_def *one = nir_imm_float(builder, 1.0f); + /* Use extra 2 channels for following swizzle */ + nir_ssa_def *ycbcr_comp[5] = { zero, zero, zero, one, zero }; + + uint8_t ycbcr_bpcs[5]; + memset(ycbcr_bpcs, y_bpc, sizeof(ycbcr_bpcs)); + + /* Go through all the planes and gather the samples into a |ycbcr_comp| + * while applying a swizzle required by the spec: + * + * R, G, B should respectively map to Cr, Y, Cb + */ + for (uint32_t p = 0; p < format->n_planes; p++) { + const struct anv_format_plane *plane_format = &format->planes[p]; + nir_ssa_def *plane_sample = create_plane_tex_instr_implicit(&state, p); + + for (uint32_t pc = 0; pc < 4; pc++) { + enum isl_channel_select ycbcr_swizzle = + swizzle_channel(plane_format->ycbcr_swizzle, pc); + if (ycbcr_swizzle == ISL_CHANNEL_SELECT_ZERO) + continue; + + unsigned ycbcr_component = channel_to_component(ycbcr_swizzle); + ycbcr_comp[ycbcr_component] = nir_channel(builder, plane_sample, pc); + + /* Also compute the number of bits for each component. */ + const struct isl_format_layout *isl_layout = + isl_format_get_layout(plane_format->isl_format); + ycbcr_bpcs[ycbcr_component] = isl_layout->channels_array[pc].bits; + } + } + + /* Now remaps components to the order specified by the conversion. 
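+    * ycbcr_comp[] carries two extra entries for this step: index 3
+    * holds the constant 1.0 used for VK_COMPONENT_SWIZZLE_ONE and
+    * index 4 holds 0.0 for VK_COMPONENT_SWIZZLE_ZERO (see
+    * swizzle_mapping below).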
*/ + nir_ssa_def *swizzled_comp[4] = { NULL, }; + uint32_t swizzled_bpcs[4] = { 0, }; + + for (uint32_t i = 0; i < ARRAY_SIZE(state.conversion->mapping); i++) { + /* Maps to components in |ycbcr_comp| */ + static const uint32_t swizzle_mapping[] = { + [VK_COMPONENT_SWIZZLE_ZERO] = 4, + [VK_COMPONENT_SWIZZLE_ONE] = 3, + [VK_COMPONENT_SWIZZLE_R] = 0, + [VK_COMPONENT_SWIZZLE_G] = 1, + [VK_COMPONENT_SWIZZLE_B] = 2, + [VK_COMPONENT_SWIZZLE_A] = 3, + }; + const VkComponentSwizzle m = state.conversion->mapping[i]; + + if (m == VK_COMPONENT_SWIZZLE_IDENTITY) { + swizzled_comp[i] = ycbcr_comp[i]; + swizzled_bpcs[i] = ycbcr_bpcs[i]; + } else { + swizzled_comp[i] = ycbcr_comp[swizzle_mapping[m]]; + swizzled_bpcs[i] = ycbcr_bpcs[swizzle_mapping[m]]; + } + } + + nir_ssa_def *result = nir_vec(builder, swizzled_comp, 4); + if (state.conversion->ycbcr_model != VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY) { + result = nir_convert_ycbcr_to_rgb(builder, + state.conversion->ycbcr_model, + state.conversion->ycbcr_range, + result, + swizzled_bpcs); + } + + nir_ssa_def_rewrite_uses(&tex->dest.ssa, result); + nir_instr_remove(&tex->instr); + + return true; +} + +bool +anv_nir_lower_ycbcr_textures(nir_shader *shader, + const struct anv_pipeline_layout *layout) +{ + return nir_shader_instructions_pass(shader, + anv_nir_lower_ycbcr_textures_instr, + nir_metadata_block_index | + nir_metadata_dominance, + (void *)layout); +} diff --git a/src/intel/vulkan_hasvk/anv_perf.c b/src/intel/vulkan_hasvk/anv_perf.c new file mode 100644 index 00000000000..36c4c30e381 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_perf.c @@ -0,0 +1,488 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include + +#include "anv_private.h" +#include "vk_util.h" + +#include "perf/intel_perf.h" +#include "perf/intel_perf_mdapi.h" + +#include "util/mesa-sha1.h" + +void +anv_physical_device_init_perf(struct anv_physical_device *device, int fd) +{ + const struct intel_device_info *devinfo = &device->info; + + device->perf = NULL; + + /* We need self modifying batches. The i915 parser prevents it on + * Gfx7.5 :( maybe one day. 
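+    * When we bail out here device->perf is left NULL, which the
+    * performance query entry points below treat as the extension not
+    * being present.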
+ */ + if (devinfo->ver < 8) + return; + + struct intel_perf_config *perf = intel_perf_new(NULL); + + intel_perf_init_metrics(perf, &device->info, fd, + false /* pipeline statistics */, + true /* register snapshots */); + + if (!perf->n_queries) + goto err; + + /* We need DRM_I915_PERF_PROP_HOLD_PREEMPTION support, only available in + * perf revision 2. + */ + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) { + if (!intel_perf_has_hold_preemption(perf)) + goto err; + } + + device->perf = perf; + + /* Compute the number of commands we need to implement a performance + * query. + */ + const struct intel_perf_query_field_layout *layout = &perf->query_layout; + device->n_perf_query_commands = 0; + for (uint32_t f = 0; f < layout->n_fields; f++) { + struct intel_perf_query_field *field = &layout->fields[f]; + + switch (field->type) { + case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: + device->n_perf_query_commands++; + break; + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: + device->n_perf_query_commands += field->size / 4; + break; + default: + unreachable("Unhandled register type"); + } + } + device->n_perf_query_commands *= 2; /* Begin & End */ + device->n_perf_query_commands += 1; /* availability */ + + return; + + err: + ralloc_free(perf); +} + +void +anv_device_perf_init(struct anv_device *device) +{ + device->perf_fd = -1; +} + +static int +anv_device_perf_open(struct anv_device *device, uint64_t metric_id) +{ + uint64_t properties[DRM_I915_PERF_PROP_MAX * 2]; + struct drm_i915_perf_open_param param; + int p = 0, stream_fd; + + properties[p++] = DRM_I915_PERF_PROP_SAMPLE_OA; + properties[p++] = true; + + properties[p++] = DRM_I915_PERF_PROP_OA_METRICS_SET; + properties[p++] = metric_id; + + properties[p++] = DRM_I915_PERF_PROP_OA_FORMAT; + properties[p++] = device->info->ver >= 8 ? + I915_OA_FORMAT_A32u40_A4u32_B8_C8 : + I915_OA_FORMAT_A45_B8_C8; + + properties[p++] = DRM_I915_PERF_PROP_OA_EXPONENT; + properties[p++] = 31; /* slowest sampling period */ + + properties[p++] = DRM_I915_PERF_PROP_CTX_HANDLE; + properties[p++] = device->context_id; + + properties[p++] = DRM_I915_PERF_PROP_HOLD_PREEMPTION; + properties[p++] = true; + + /* If global SSEU is available, pin it to the default. This will ensure on + * Gfx11 for instance we use the full EU array. Initially when perf was + * enabled we would use only half on Gfx11 because of functional + * requirements. + * + * Temporary disable this option on Gfx12.5+, kernel doesn't appear to + * support it. 
+ */ + if (intel_perf_has_global_sseu(device->physical->perf) && + device->info->verx10 < 125) { + properties[p++] = DRM_I915_PERF_PROP_GLOBAL_SSEU; + properties[p++] = (uintptr_t) &device->physical->perf->sseu; + } + + memset(¶m, 0, sizeof(param)); + param.flags = 0; + param.flags |= I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_FD_NONBLOCK; + param.properties_ptr = (uintptr_t)properties; + param.num_properties = p / 2; + + stream_fd = intel_ioctl(device->fd, DRM_IOCTL_I915_PERF_OPEN, ¶m); + return stream_fd; +} + +/* VK_INTEL_performance_query */ +VkResult anv_InitializePerformanceApiINTEL( + VkDevice _device, + const VkInitializePerformanceApiInfoINTEL* pInitializeInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!device->physical->perf) + return VK_ERROR_EXTENSION_NOT_PRESENT; + + /* Not much to do here */ + return VK_SUCCESS; +} + +VkResult anv_GetPerformanceParameterINTEL( + VkDevice _device, + VkPerformanceParameterTypeINTEL parameter, + VkPerformanceValueINTEL* pValue) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!device->physical->perf) + return VK_ERROR_EXTENSION_NOT_PRESENT; + + VkResult result = VK_SUCCESS; + switch (parameter) { + case VK_PERFORMANCE_PARAMETER_TYPE_HW_COUNTERS_SUPPORTED_INTEL: + pValue->type = VK_PERFORMANCE_VALUE_TYPE_BOOL_INTEL; + pValue->data.valueBool = VK_TRUE; + break; + + case VK_PERFORMANCE_PARAMETER_TYPE_STREAM_MARKER_VALID_BITS_INTEL: + pValue->type = VK_PERFORMANCE_VALUE_TYPE_UINT32_INTEL; + pValue->data.value32 = 25; + break; + + default: + result = VK_ERROR_FEATURE_NOT_PRESENT; + break; + } + + return result; +} + +VkResult anv_CmdSetPerformanceMarkerINTEL( + VkCommandBuffer commandBuffer, + const VkPerformanceMarkerInfoINTEL* pMarkerInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer->intel_perf_marker = pMarkerInfo->marker; + + return VK_SUCCESS; +} + +VkResult anv_AcquirePerformanceConfigurationINTEL( + VkDevice _device, + const VkPerformanceConfigurationAcquireInfoINTEL* pAcquireInfo, + VkPerformanceConfigurationINTEL* pConfiguration) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_performance_configuration_intel *config; + + config = vk_object_alloc(&device->vk, NULL, sizeof(*config), + VK_OBJECT_TYPE_PERFORMANCE_CONFIGURATION_INTEL); + if (!config) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) { + config->register_config = + intel_perf_load_configuration(device->physical->perf, device->fd, + INTEL_PERF_QUERY_GUID_MDAPI); + if (!config->register_config) { + vk_object_free(&device->vk, NULL, config); + return VK_INCOMPLETE; + } + + int ret = + intel_perf_store_configuration(device->physical->perf, device->fd, + config->register_config, NULL /* guid */); + if (ret < 0) { + ralloc_free(config->register_config); + vk_object_free(&device->vk, NULL, config); + return VK_INCOMPLETE; + } + + config->config_id = ret; + } + + *pConfiguration = anv_performance_configuration_intel_to_handle(config); + + return VK_SUCCESS; +} + +VkResult anv_ReleasePerformanceConfigurationINTEL( + VkDevice _device, + VkPerformanceConfigurationINTEL _configuration) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_performance_configuration_intel, config, _configuration); + + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) + intel_ioctl(device->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config->config_id); + + ralloc_free(config->register_config); + + vk_object_free(&device->vk, NULL, config); + + return VK_SUCCESS; +} + +VkResult 
anv_QueueSetPerformanceConfigurationINTEL( + VkQueue _queue, + VkPerformanceConfigurationINTEL _configuration) +{ + ANV_FROM_HANDLE(anv_queue, queue, _queue); + ANV_FROM_HANDLE(anv_performance_configuration_intel, config, _configuration); + struct anv_device *device = queue->device; + + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) { + if (device->perf_fd < 0) { + device->perf_fd = anv_device_perf_open(device, config->config_id); + if (device->perf_fd < 0) + return VK_ERROR_INITIALIZATION_FAILED; + } else { + int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG, + (void *)(uintptr_t) config->config_id); + if (ret < 0) + return vk_device_set_lost(&device->vk, "i915-perf config failed: %m"); + } + } + + return VK_SUCCESS; +} + +void anv_UninitializePerformanceApiINTEL( + VkDevice _device) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (device->perf_fd >= 0) { + close(device->perf_fd); + device->perf_fd = -1; + } +} + +/* VK_KHR_performance_query */ +static const VkPerformanceCounterUnitKHR +intel_perf_counter_unit_to_vk_unit[] = { + [INTEL_PERF_COUNTER_UNITS_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR, + [INTEL_PERF_COUNTER_UNITS_HZ] = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR, + [INTEL_PERF_COUNTER_UNITS_NS] = VK_PERFORMANCE_COUNTER_UNIT_NANOSECONDS_KHR, + [INTEL_PERF_COUNTER_UNITS_US] = VK_PERFORMANCE_COUNTER_UNIT_NANOSECONDS_KHR, /* todo */ + [INTEL_PERF_COUNTER_UNITS_PIXELS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_TEXELS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_THREADS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_PERCENT] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR, + [INTEL_PERF_COUNTER_UNITS_MESSAGES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_NUMBER] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_CYCLES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_EVENTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_UTILIZATION] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_EU_SENDS_TO_L3_CACHE_LINES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_EU_REQUESTS_TO_L3_CACHE_LINES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_EU_BYTES_PER_L3_CACHE_LINE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, +}; + +static const VkPerformanceCounterStorageKHR +intel_perf_counter_data_type_to_vk_storage[] = { + [INTEL_PERF_COUNTER_DATA_TYPE_BOOL32] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR, + [INTEL_PERF_COUNTER_DATA_TYPE_UINT32] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR, + [INTEL_PERF_COUNTER_DATA_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR, + [INTEL_PERF_COUNTER_DATA_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, + [INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR, +}; + +VkResult anv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + uint32_t* pCounterCount, + VkPerformanceCounterKHR* pCounters, + VkPerformanceCounterDescriptionKHR* pCounterDescriptions) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + struct intel_perf_config *perf = pdevice->perf; + + uint32_t desc_count = *pCounterCount; + + VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount); + 
VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc, + pCounterDescriptions, &desc_count); + + /* We cannot support performance queries on anything other than RCS, + * because the MI_REPORT_PERF_COUNT command is not available on other + * engines. + */ + struct anv_queue_family *queue_family = + &pdevice->queue.families[queueFamilyIndex]; + if (queue_family->engine_class != I915_ENGINE_CLASS_RENDER) + return vk_outarray_status(&out); + + for (int c = 0; c < (perf ? perf->n_counters : 0); c++) { + const struct intel_perf_query_counter *intel_counter = perf->counter_infos[c].counter; + + vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { + counter->unit = intel_perf_counter_unit_to_vk_unit[intel_counter->units]; + counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; + counter->storage = intel_perf_counter_data_type_to_vk_storage[intel_counter->data_type]; + + unsigned char sha1_result[20]; + _mesa_sha1_compute(intel_counter->symbol_name, + strlen(intel_counter->symbol_name), + sha1_result); + memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); + } + + vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) { + desc->flags = 0; /* None so far. */ + snprintf(desc->name, sizeof(desc->name), "%s", intel_counter->name); + snprintf(desc->category, sizeof(desc->category), "%s", intel_counter->category); + snprintf(desc->description, sizeof(desc->description), "%s", intel_counter->desc); + } + } + + return vk_outarray_status(&out); +} + +void anv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( + VkPhysicalDevice physicalDevice, + const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo, + uint32_t* pNumPasses) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + struct intel_perf_config *perf = pdevice->perf; + + if (!perf) { + *pNumPasses = 0; + return; + } + + *pNumPasses = intel_perf_get_n_passes(perf, + pPerformanceQueryCreateInfo->pCounterIndices, + pPerformanceQueryCreateInfo->counterIndexCount, + NULL); +} + +VkResult anv_AcquireProfilingLockKHR( + VkDevice _device, + const VkAcquireProfilingLockInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct intel_perf_config *perf = device->physical->perf; + struct intel_perf_query_info *first_metric_set = &perf->queries[0]; + int fd = -1; + + assert(device->perf_fd == -1); + + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) { + fd = anv_device_perf_open(device, first_metric_set->oa_metrics_set_id); + if (fd < 0) + return VK_TIMEOUT; + } + + device->perf_fd = fd; + return VK_SUCCESS; +} + +void anv_ReleaseProfilingLockKHR( + VkDevice _device) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) { + assert(device->perf_fd >= 0); + close(device->perf_fd); + } + device->perf_fd = -1; +} + +void +anv_perf_write_pass_results(struct intel_perf_config *perf, + struct anv_query_pool *pool, uint32_t pass, + const struct intel_perf_query_result *accumulated_results, + union VkPerformanceCounterResultKHR *results) +{ + for (uint32_t c = 0; c < pool->n_counters; c++) { + const struct intel_perf_counter_pass *counter_pass = &pool->counter_pass[c]; + + if (counter_pass->pass != pass) + continue; + + switch (pool->pass_query[pass]->kind) { + case INTEL_PERF_QUERY_TYPE_PIPELINE: { + assert(counter_pass->counter->data_type == INTEL_PERF_COUNTER_DATA_TYPE_UINT64); + uint32_t accu_offset = counter_pass->counter->offset / sizeof(uint64_t); + results[c].uint64 = accumulated_results->accumulator[accu_offset]; + break; + 
} + + case INTEL_PERF_QUERY_TYPE_OA: + case INTEL_PERF_QUERY_TYPE_RAW: + switch (counter_pass->counter->data_type) { + case INTEL_PERF_COUNTER_DATA_TYPE_UINT64: + results[c].uint64 = + counter_pass->counter->oa_counter_read_uint64(perf, + counter_pass->query, + accumulated_results); + break; + case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT: + results[c].float32 = + counter_pass->counter->oa_counter_read_float(perf, + counter_pass->query, + accumulated_results); + break; + default: + /* So far we aren't using uint32, double or bool32... */ + unreachable("unexpected counter data type"); + } + break; + + default: + unreachable("invalid query type"); + } + + /* The Vulkan extension only has nanoseconds as a unit */ + if (counter_pass->counter->units == INTEL_PERF_COUNTER_UNITS_US) { + assert(counter_pass->counter->data_type == INTEL_PERF_COUNTER_DATA_TYPE_UINT64); + results[c].uint64 *= 1000; + } + } +} diff --git a/src/intel/vulkan_hasvk/anv_pipeline.c b/src/intel/vulkan_hasvk/anv_pipeline.c new file mode 100644 index 00000000000..1765b33070d --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_pipeline.c @@ -0,0 +1,3300 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "util/mesa-sha1.h" +#include "util/os_time.h" +#include "common/intel_l3_config.h" +#include "common/intel_disasm.h" +#include "common/intel_sample_positions.h" +#include "anv_private.h" +#include "compiler/brw_nir.h" +#include "compiler/brw_nir_rt.h" +#include "anv_nir.h" +#include "nir/nir_xfb_info.h" +#include "spirv/nir_spirv.h" +#include "vk_pipeline.h" +#include "vk_render_pass.h" +#include "vk_util.h" + +/* Needed for SWIZZLE macros */ +#include "program/prog_instruction.h" + +/* Eventually, this will become part of anv_CreateShader. Unfortunately, + * we can't do that yet because we don't have the ability to copy nir. 
+ */ +static nir_shader * +anv_shader_stage_to_nir(struct anv_device *device, + const VkPipelineShaderStageCreateInfo *stage_info, + void *mem_ctx) +{ + const struct anv_physical_device *pdevice = device->physical; + const struct anv_instance *instance = pdevice->instance; + const struct brw_compiler *compiler = pdevice->compiler; + gl_shader_stage stage = vk_to_mesa_shader_stage(stage_info->stage); + const nir_shader_compiler_options *nir_options = + compiler->nir_options[stage]; + + const struct spirv_to_nir_options spirv_options = { + .caps = { + .demote_to_helper_invocation = true, + .derivative_group = true, + .descriptor_array_dynamic_indexing = true, + .descriptor_array_non_uniform_indexing = true, + .descriptor_indexing = true, + .device_group = true, + .draw_parameters = true, + .float16 = pdevice->info.ver >= 8, + .float32_atomic_add = pdevice->info.has_lsc, + .float32_atomic_min_max = pdevice->info.ver >= 9, + .float64 = pdevice->info.ver >= 8, + .float64_atomic_min_max = pdevice->info.has_lsc, + .fragment_shader_sample_interlock = pdevice->info.ver >= 9, + .fragment_shader_pixel_interlock = pdevice->info.ver >= 9, + .geometry_streams = true, + /* When using Vulkan 1.3 or KHR_format_feature_flags2 is enabled, the + * read/write without format is per format, so just report true. It's + * up to the application to check. + */ + .image_read_without_format = instance->vk.app_info.api_version >= VK_API_VERSION_1_3 || device->vk.enabled_extensions.KHR_format_feature_flags2, + .image_write_without_format = true, + .int8 = pdevice->info.ver >= 8, + .int16 = pdevice->info.ver >= 8, + .int64 = pdevice->info.ver >= 8, + .int64_atomics = pdevice->info.ver >= 9 && pdevice->use_softpin, + .integer_functions2 = pdevice->info.ver >= 8, + .mesh_shading_nv = pdevice->vk.supported_extensions.NV_mesh_shader, + .min_lod = true, + .multiview = true, + .physical_storage_buffer_address = pdevice->has_a64_buffer_access, + .post_depth_coverage = pdevice->info.ver >= 9, + .runtime_descriptor_array = true, + .float_controls = pdevice->info.ver >= 8, + .ray_query = pdevice->info.has_ray_tracing, + .ray_tracing = pdevice->info.has_ray_tracing, + .shader_clock = true, + .shader_viewport_index_layer = true, + .stencil_export = pdevice->info.ver >= 9, + .storage_8bit = pdevice->info.ver >= 8, + .storage_16bit = pdevice->info.ver >= 8, + .subgroup_arithmetic = true, + .subgroup_basic = true, + .subgroup_ballot = true, + .subgroup_dispatch = true, + .subgroup_quad = true, + .subgroup_uniform_control_flow = true, + .subgroup_shuffle = true, + .subgroup_vote = true, + .tessellation = true, + .transform_feedback = pdevice->info.ver >= 8, + .variable_pointers = true, + .vk_memory_model = true, + .vk_memory_model_device_scope = true, + .workgroup_memory_explicit_layout = true, + .fragment_shading_rate = pdevice->info.ver >= 11, + }, + .ubo_addr_format = + anv_nir_ubo_addr_format(pdevice, device->robust_buffer_access), + .ssbo_addr_format = + anv_nir_ssbo_addr_format(pdevice, device->robust_buffer_access), + .phys_ssbo_addr_format = nir_address_format_64bit_global, + .push_const_addr_format = nir_address_format_logical, + + /* TODO: Consider changing this to an address format that has the NULL + * pointer equals to 0. That might be a better format to play nice + * with certain code / code generators. 
+ */ + .shared_addr_format = nir_address_format_32bit_offset, + }; + + nir_shader *nir; + VkResult result = + vk_pipeline_shader_stage_to_nir(&device->vk, stage_info, + &spirv_options, nir_options, + mem_ctx, &nir); + if (result != VK_SUCCESS) + return NULL; + + if (INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage))) { + fprintf(stderr, "NIR (from SPIR-V) for %s shader:\n", + gl_shader_stage_name(stage)); + nir_print_shader(nir, stderr); + } + + NIR_PASS_V(nir, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(nir), true, false); + + const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = { + .point_coord = true, + }; + NIR_PASS(_, nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings); + + const nir_opt_access_options opt_access_options = { + .is_vulkan = true, + .infer_non_readable = true, + }; + NIR_PASS(_, nir, nir_opt_access, &opt_access_options); + + NIR_PASS(_, nir, nir_lower_frexp); + + /* Vulkan uses the separate-shader linking model */ + nir->info.separate_shader = true; + + brw_preprocess_nir(compiler, nir, NULL); + + return nir; +} + +VkResult +anv_pipeline_init(struct anv_pipeline *pipeline, + struct anv_device *device, + enum anv_pipeline_type type, + VkPipelineCreateFlags flags, + const VkAllocationCallbacks *pAllocator) +{ + VkResult result; + + memset(pipeline, 0, sizeof(*pipeline)); + + vk_object_base_init(&device->vk, &pipeline->base, + VK_OBJECT_TYPE_PIPELINE); + pipeline->device = device; + + /* It's the job of the child class to provide actual backing storage for + * the batch by setting batch.start, batch.next, and batch.end. + */ + pipeline->batch.alloc = pAllocator ? pAllocator : &device->vk.alloc; + pipeline->batch.relocs = &pipeline->batch_relocs; + pipeline->batch.status = VK_SUCCESS; + + result = anv_reloc_list_init(&pipeline->batch_relocs, + pipeline->batch.alloc); + if (result != VK_SUCCESS) + return result; + + pipeline->mem_ctx = ralloc_context(NULL); + + pipeline->type = type; + pipeline->flags = flags; + + util_dynarray_init(&pipeline->executables, pipeline->mem_ctx); + + return VK_SUCCESS; +} + +void +anv_pipeline_finish(struct anv_pipeline *pipeline, + struct anv_device *device, + const VkAllocationCallbacks *pAllocator) +{ + anv_reloc_list_finish(&pipeline->batch_relocs, + pAllocator ? 
pAllocator : &device->vk.alloc); + ralloc_free(pipeline->mem_ctx); + vk_object_base_finish(&pipeline->base); +} + +void anv_DestroyPipeline( + VkDevice _device, + VkPipeline _pipeline, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline); + + if (!pipeline) + return; + + switch (pipeline->type) { + case ANV_PIPELINE_GRAPHICS: { + struct anv_graphics_pipeline *gfx_pipeline = + anv_pipeline_to_graphics(pipeline); + + for (unsigned s = 0; s < ARRAY_SIZE(gfx_pipeline->shaders); s++) { + if (gfx_pipeline->shaders[s]) + anv_shader_bin_unref(device, gfx_pipeline->shaders[s]); + } + break; + } + + case ANV_PIPELINE_COMPUTE: { + struct anv_compute_pipeline *compute_pipeline = + anv_pipeline_to_compute(pipeline); + + if (compute_pipeline->cs) + anv_shader_bin_unref(device, compute_pipeline->cs); + + break; + } + + case ANV_PIPELINE_RAY_TRACING: { + struct anv_ray_tracing_pipeline *rt_pipeline = + anv_pipeline_to_ray_tracing(pipeline); + + util_dynarray_foreach(&rt_pipeline->shaders, + struct anv_shader_bin *, shader) { + anv_shader_bin_unref(device, *shader); + } + break; + } + + default: + unreachable("invalid pipeline type"); + } + + anv_pipeline_finish(pipeline, device, pAllocator); + vk_free2(&device->vk.alloc, pAllocator, pipeline); +} + +static void +populate_sampler_prog_key(const struct intel_device_info *devinfo, + struct brw_sampler_prog_key_data *key) +{ + /* Almost all multisampled textures are compressed. The only time when we + * don't compress a multisampled texture is for 16x MSAA with a surface + * width greater than 8k which is a bit of an edge case. Since the sampler + * just ignores the MCS parameter to ld2ms when MCS is disabled, it's safe + * to tell the compiler to always assume compression. + */ + key->compressed_multisample_layout_mask = ~0; + + /* SkyLake added support for 16x MSAA. With this came a new message for + * reading from a 16x MSAA surface with compression. The new message was + * needed because now the MCS data is 64 bits instead of 32 or lower as is + * the case for 8x, 4x, and 2x. The key->msaa_16 bit-field controls which + * message we use. Fortunately, the 16x message works for 8x, 4x, and 2x + * so we can just use it unconditionally. This may not be quite as + * efficient but it saves us from recompiling. + */ + if (devinfo->ver >= 9) + key->msaa_16 = ~0; + + /* XXX: Handle texture swizzle on HSW- */ + for (int i = 0; i < BRW_MAX_SAMPLERS; i++) { + /* Assume color sampler, no swizzling. 
(Works for BDW+) */ + key->swizzles[i] = SWIZZLE_XYZW; + } +} + +static void +populate_base_prog_key(const struct anv_device *device, + bool robust_buffer_acccess, + struct brw_base_prog_key *key) +{ + key->robust_buffer_access = robust_buffer_acccess; + key->limit_trig_input_range = + device->physical->instance->limit_trig_input_range; + + populate_sampler_prog_key(device->info, &key->tex); +} + +static void +populate_vs_prog_key(const struct anv_device *device, + bool robust_buffer_acccess, + struct brw_vs_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_acccess, &key->base); + + /* XXX: Handle vertex input work-arounds */ + + /* XXX: Handle sampler_prog_key */ +} + +static void +populate_tcs_prog_key(const struct anv_device *device, + bool robust_buffer_acccess, + unsigned input_vertices, + struct brw_tcs_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_acccess, &key->base); + + key->input_vertices = input_vertices; +} + +static void +populate_tes_prog_key(const struct anv_device *device, + bool robust_buffer_acccess, + struct brw_tes_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_acccess, &key->base); +} + +static void +populate_gs_prog_key(const struct anv_device *device, + bool robust_buffer_acccess, + struct brw_gs_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_acccess, &key->base); +} + +static bool +pipeline_has_coarse_pixel(const struct anv_graphics_pipeline *pipeline, + const BITSET_WORD *dynamic, + const struct vk_multisample_state *ms, + const struct vk_fragment_shading_rate_state *fsr) +{ + /* The Vulkan 1.2.199 spec says: + * + * "If any of the following conditions are met, Cxy' must be set to + * {1,1}: + * + * * If Sample Shading is enabled. + * * [...]" + * + * And "sample shading" is defined as follows: + * + * "Sample shading is enabled for a graphics pipeline: + * + * * If the interface of the fragment shader entry point of the + * graphics pipeline includes an input variable decorated with + * SampleId or SamplePosition. In this case minSampleShadingFactor + * takes the value 1.0. + * + * * Else if the sampleShadingEnable member of the + * VkPipelineMultisampleStateCreateInfo structure specified when + * creating the graphics pipeline is set to VK_TRUE. In this case + * minSampleShadingFactor takes the value of + * VkPipelineMultisampleStateCreateInfo::minSampleShading. + * + * Otherwise, sample shading is considered disabled." + * + * The first bullet above is handled by the back-end compiler because those + * inputs both force per-sample dispatch. The second bullet is handled + * here. Note that this sample shading being enabled has nothing to do + * with minSampleShading. + */ + if (ms != NULL && ms->sample_shading_enable) + return false; + + /* Not dynamic & pipeline has a 1x1 fragment shading rate with no + * possibility for element of the pipeline to change the value. 
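+    * Both combiner ops must be KEEP: KEEP preserves the pipeline's 1x1
+    * rate, whereas any other combiner op could let the primitive or
+    * attachment shading rate enlarge the fragment size.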
+ */ + if (!BITSET_TEST(dynamic, MESA_VK_DYNAMIC_FSR) && + fsr->fragment_size.width <= 1 && + fsr->fragment_size.height <= 1 && + fsr->combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR && + fsr->combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) + return false; + + return true; +} + +static void +populate_task_prog_key(const struct anv_device *device, + bool robust_buffer_access, + struct brw_task_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_access, &key->base); +} + +static void +populate_mesh_prog_key(const struct anv_device *device, + bool robust_buffer_access, + struct brw_mesh_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_access, &key->base); +} + +static void +populate_wm_prog_key(const struct anv_graphics_pipeline *pipeline, + bool robust_buffer_acccess, + const BITSET_WORD *dynamic, + const struct vk_multisample_state *ms, + const struct vk_fragment_shading_rate_state *fsr, + const struct vk_render_pass_state *rp, + struct brw_wm_prog_key *key) +{ + const struct anv_device *device = pipeline->base.device; + + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_acccess, &key->base); + + /* We set this to 0 here and set to the actual value before we call + * brw_compile_fs. + */ + key->input_slots_valid = 0; + + /* XXX Vulkan doesn't appear to specify */ + key->clamp_fragment_color = false; + + key->ignore_sample_mask_out = false; + + assert(rp->color_attachment_count <= MAX_RTS); + /* Consider all inputs as valid until look at the NIR variables. */ + key->color_outputs_valid = (1u << rp->color_attachment_count) - 1; + key->nr_color_regions = rp->color_attachment_count; + + /* To reduce possible shader recompilations we would need to know if + * there is a SampleMask output variable to compute if we should emit + * code to workaround the issue that hardware disables alpha to coverage + * when there is SampleMask output. + */ + key->alpha_to_coverage = ms != NULL && ms->alpha_to_coverage_enable; + + /* Vulkan doesn't support fixed-function alpha test */ + key->alpha_test_replicate_alpha = false; + + if (ms != NULL) { + /* We should probably pull this out of the shader, but it's fairly + * harmless to compute it and then let dead-code take care of it. 
+ */ + if (ms->rasterization_samples > 1) { + key->persample_interp = ms->sample_shading_enable && + (ms->min_sample_shading * ms->rasterization_samples) > 1; + key->multisample_fbo = true; + } + + if (device->physical->instance->sample_mask_out_opengl_behaviour) + key->ignore_sample_mask_out = !key->multisample_fbo; + } + + key->coarse_pixel = + !key->persample_interp && + device->vk.enabled_extensions.KHR_fragment_shading_rate && + pipeline_has_coarse_pixel(pipeline, dynamic, ms, fsr); +} + +static void +populate_cs_prog_key(const struct anv_device *device, + bool robust_buffer_acccess, + struct brw_cs_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_acccess, &key->base); +} + +static void +populate_bs_prog_key(const struct anv_device *device, + bool robust_buffer_access, + struct brw_bs_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_access, &key->base); +} + +struct anv_pipeline_stage { + gl_shader_stage stage; + + const VkPipelineShaderStageCreateInfo *info; + + unsigned char shader_sha1[20]; + + union brw_any_prog_key key; + + struct { + gl_shader_stage stage; + unsigned char sha1[20]; + } cache_key; + + nir_shader *nir; + + struct anv_pipeline_binding surface_to_descriptor[256]; + struct anv_pipeline_binding sampler_to_descriptor[256]; + struct anv_pipeline_bind_map bind_map; + + union brw_any_prog_data prog_data; + + uint32_t num_stats; + struct brw_compile_stats stats[3]; + char *disasm[3]; + + VkPipelineCreationFeedback feedback; + + const unsigned *code; + + struct anv_shader_bin *bin; +}; + +static void +anv_pipeline_hash_graphics(struct anv_graphics_pipeline *pipeline, + struct anv_pipeline_layout *layout, + struct anv_pipeline_stage *stages, + unsigned char *sha1_out) +{ + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + + _mesa_sha1_update(&ctx, &pipeline->view_mask, + sizeof(pipeline->view_mask)); + + if (layout) + _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); + + const bool rba = pipeline->base.device->robust_buffer_access; + _mesa_sha1_update(&ctx, &rba, sizeof(rba)); + + for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) { + if (stages[s].info) { + _mesa_sha1_update(&ctx, stages[s].shader_sha1, + sizeof(stages[s].shader_sha1)); + _mesa_sha1_update(&ctx, &stages[s].key, brw_prog_key_size(s)); + } + } + + _mesa_sha1_final(&ctx, sha1_out); +} + +static void +anv_pipeline_hash_compute(struct anv_compute_pipeline *pipeline, + struct anv_pipeline_layout *layout, + struct anv_pipeline_stage *stage, + unsigned char *sha1_out) +{ + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + + if (layout) + _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); + + const struct anv_device *device = pipeline->base.device; + + const bool rba = device->robust_buffer_access; + _mesa_sha1_update(&ctx, &rba, sizeof(rba)); + + const bool afs = device->physical->instance->assume_full_subgroups; + _mesa_sha1_update(&ctx, &afs, sizeof(afs)); + + _mesa_sha1_update(&ctx, stage->shader_sha1, + sizeof(stage->shader_sha1)); + _mesa_sha1_update(&ctx, &stage->key.cs, sizeof(stage->key.cs)); + + _mesa_sha1_final(&ctx, sha1_out); +} + +static void +anv_pipeline_hash_ray_tracing_shader(struct anv_ray_tracing_pipeline *pipeline, + struct anv_pipeline_layout *layout, + struct anv_pipeline_stage *stage, + unsigned char *sha1_out) +{ + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + + if (layout != NULL) + _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); + + const bool 
rba = pipeline->base.device->robust_buffer_access; + _mesa_sha1_update(&ctx, &rba, sizeof(rba)); + + _mesa_sha1_update(&ctx, stage->shader_sha1, sizeof(stage->shader_sha1)); + _mesa_sha1_update(&ctx, &stage->key, sizeof(stage->key.bs)); + + _mesa_sha1_final(&ctx, sha1_out); +} + +static void +anv_pipeline_hash_ray_tracing_combined_shader(struct anv_ray_tracing_pipeline *pipeline, + struct anv_pipeline_layout *layout, + struct anv_pipeline_stage *intersection, + struct anv_pipeline_stage *any_hit, + unsigned char *sha1_out) +{ + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + + if (layout != NULL) + _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); + + const bool rba = pipeline->base.device->robust_buffer_access; + _mesa_sha1_update(&ctx, &rba, sizeof(rba)); + + _mesa_sha1_update(&ctx, intersection->shader_sha1, sizeof(intersection->shader_sha1)); + _mesa_sha1_update(&ctx, &intersection->key, sizeof(intersection->key.bs)); + _mesa_sha1_update(&ctx, any_hit->shader_sha1, sizeof(any_hit->shader_sha1)); + _mesa_sha1_update(&ctx, &any_hit->key, sizeof(any_hit->key.bs)); + + _mesa_sha1_final(&ctx, sha1_out); +} + +static nir_shader * +anv_pipeline_stage_get_nir(struct anv_pipeline *pipeline, + struct vk_pipeline_cache *cache, + void *mem_ctx, + struct anv_pipeline_stage *stage) +{ + const struct brw_compiler *compiler = + pipeline->device->physical->compiler; + const nir_shader_compiler_options *nir_options = + compiler->nir_options[stage->stage]; + nir_shader *nir; + + nir = anv_device_search_for_nir(pipeline->device, cache, + nir_options, + stage->shader_sha1, + mem_ctx); + if (nir) { + assert(nir->info.stage == stage->stage); + return nir; + } + + nir = anv_shader_stage_to_nir(pipeline->device, stage->info, mem_ctx); + if (nir) { + anv_device_upload_nir(pipeline->device, cache, nir, stage->shader_sha1); + return nir; + } + + return NULL; +} + +static void +shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = glsl_type_is_boolean(type) + ? 4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, + *align = comp_size * (length == 3 ? 
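/* a vec3 slot is aligned like a vec4 */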
4 : length); +} + +static void +anv_pipeline_lower_nir(struct anv_pipeline *pipeline, + void *mem_ctx, + struct anv_pipeline_stage *stage, + struct anv_pipeline_layout *layout, + bool use_primitive_replication) +{ + const struct anv_physical_device *pdevice = pipeline->device->physical; + const struct brw_compiler *compiler = pdevice->compiler; + + struct brw_stage_prog_data *prog_data = &stage->prog_data.base; + nir_shader *nir = stage->nir; + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, nir_lower_wpos_center); + NIR_PASS(_, nir, nir_lower_input_attachments, + &(nir_input_attachment_options) { + .use_fragcoord_sysval = true, + .use_layer_id_sysval = true, + }); + } + + NIR_PASS(_, nir, anv_nir_lower_ycbcr_textures, layout); + + if (pipeline->type == ANV_PIPELINE_GRAPHICS) { + struct anv_graphics_pipeline *gfx_pipeline = + anv_pipeline_to_graphics(pipeline); + NIR_PASS(_, nir, anv_nir_lower_multiview, gfx_pipeline->view_mask, + use_primitive_replication); + } + + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + NIR_PASS(_, nir, brw_nir_lower_storage_image, compiler->devinfo); + + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global, + nir_address_format_64bit_global); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const, + nir_address_format_32bit_offset); + + NIR_PASS(_, nir, brw_nir_lower_ray_queries, &pdevice->info); + + /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ + NIR_PASS_V(nir, anv_nir_apply_pipeline_layout, + pdevice, pipeline->device->robust_buffer_access, + layout, &stage->bind_map); + + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo, + anv_nir_ubo_addr_format(pdevice, + pipeline->device->robust_buffer_access)); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo, + anv_nir_ssbo_addr_format(pdevice, + pipeline->device->robust_buffer_access)); + + /* First run copy-prop to get rid of all of the vec() that address + * calculations often create and then constant-fold so that, when we + * get to anv_nir_lower_ubo_loads, we can detect constant offsets. + */ + NIR_PASS(_, nir, nir_copy_prop); + NIR_PASS(_, nir, nir_opt_constant_folding); + + NIR_PASS(_, nir, anv_nir_lower_ubo_loads); + + enum nir_lower_non_uniform_access_type lower_non_uniform_access_types = + nir_lower_non_uniform_texture_access | nir_lower_non_uniform_image_access; + + /* In practice, most shaders do not have non-uniform-qualified + * accesses (see + * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17558#note_1475069) + * thus a cheaper and likely to fail check is run first. + */ + if (nir_has_non_uniform_access(nir, lower_non_uniform_access_types)) { + NIR_PASS(_, nir, nir_opt_non_uniform_access); + + /* We don't support non-uniform UBOs and non-uniform SSBO access is + * handled naturally by falling back to A64 messages. 
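+ * Only the texture and image access types selected above are lowered at
+ * this point.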
+ */ + NIR_PASS(_, nir, nir_lower_non_uniform_access, + &(nir_lower_non_uniform_access_options) { + .types = lower_non_uniform_access_types, + .callback = NULL, + }); + } + + NIR_PASS_V(nir, anv_nir_compute_push_layout, + pdevice, pipeline->device->robust_buffer_access, + prog_data, &stage->bind_map, mem_ctx); + + if (gl_shader_stage_uses_workgroup(nir->info.stage)) { + if (!nir->info.shared_memory_explicit_layout) { + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared, shared_type_info); + } + + NIR_PASS(_, nir, nir_lower_explicit_io, + nir_var_mem_shared, nir_address_format_32bit_offset); + + if (nir->info.zero_initialize_shared_memory && + nir->info.shared_size > 0) { + /* The effective Shared Local Memory size is at least 1024 bytes and + * is always rounded to a power of two, so it is OK to align the size + * used by the shader to chunk_size -- which does simplify the logic. + */ + const unsigned chunk_size = 16; + const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size); + assert(shared_size <= + intel_calculate_slm_size(compiler->devinfo->ver, nir->info.shared_size)); + + NIR_PASS(_, nir, nir_zero_initialize_shared_memory, + shared_size, chunk_size); + } + } + + if (gl_shader_stage_is_compute(nir->info.stage) || + gl_shader_stage_is_mesh(nir->info.stage)) + NIR_PASS(_, nir, brw_nir_lower_cs_intrinsics); + + stage->nir = nir; +} + +static void +anv_pipeline_link_vs(const struct brw_compiler *compiler, + struct anv_pipeline_stage *vs_stage, + struct anv_pipeline_stage *next_stage) +{ + if (next_stage) + brw_nir_link_shaders(compiler, vs_stage->nir, next_stage->nir); +} + +static void +anv_pipeline_compile_vs(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_graphics_pipeline *pipeline, + struct anv_pipeline_stage *vs_stage) +{ + /* When using Primitive Replication for multiview, each view gets its own + * position slot. + */ + uint32_t pos_slots = + (vs_stage->nir->info.per_view_outputs & VARYING_BIT_POS) ? + MAX2(1, util_bitcount(pipeline->view_mask)) : 1; + + /* Only position is allowed to be per-view */ + assert(!(vs_stage->nir->info.per_view_outputs & ~VARYING_BIT_POS)); + + brw_compute_vue_map(compiler->devinfo, + &vs_stage->prog_data.vs.base.vue_map, + vs_stage->nir->info.outputs_written, + vs_stage->nir->info.separate_shader, + pos_slots); + + vs_stage->num_stats = 1; + + struct brw_compile_vs_params params = { + .nir = vs_stage->nir, + .key = &vs_stage->key.vs, + .prog_data = &vs_stage->prog_data.vs, + .stats = vs_stage->stats, + .log_data = pipeline->base.device, + }; + + vs_stage->code = brw_compile_vs(compiler, mem_ctx, ¶ms); +} + +static void +merge_tess_info(struct shader_info *tes_info, + const struct shader_info *tcs_info) +{ + /* The Vulkan 1.0.38 spec, section 21.1 Tessellator says: + * + * "PointMode. Controls generation of points rather than triangles + * or lines. This functionality defaults to disabled, and is + * enabled if either shader stage includes the execution mode. + * + * and about Triangles, Quads, IsoLines, VertexOrderCw, VertexOrderCcw, + * PointMode, SpacingEqual, SpacingFractionalEven, SpacingFractionalOdd, + * and OutputVertices, it says: + * + * "One mode must be set in at least one of the tessellation + * shader stages." + * + * So, the fields can be set in either the TCS or TES, but they must + * agree if set in both. Our backend looks at TES, so bitwise-or in + * the values from the TCS. 
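+ * For example, if only the TES declares SpacingEqual the OR below keeps
+ * it, while the asserts catch the case where both stages declare
+ * conflicting values.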
+ */ + assert(tcs_info->tess.tcs_vertices_out == 0 || + tes_info->tess.tcs_vertices_out == 0 || + tcs_info->tess.tcs_vertices_out == tes_info->tess.tcs_vertices_out); + tes_info->tess.tcs_vertices_out |= tcs_info->tess.tcs_vertices_out; + + assert(tcs_info->tess.spacing == TESS_SPACING_UNSPECIFIED || + tes_info->tess.spacing == TESS_SPACING_UNSPECIFIED || + tcs_info->tess.spacing == tes_info->tess.spacing); + tes_info->tess.spacing |= tcs_info->tess.spacing; + + assert(tcs_info->tess._primitive_mode == 0 || + tes_info->tess._primitive_mode == 0 || + tcs_info->tess._primitive_mode == tes_info->tess._primitive_mode); + tes_info->tess._primitive_mode |= tcs_info->tess._primitive_mode; + tes_info->tess.ccw |= tcs_info->tess.ccw; + tes_info->tess.point_mode |= tcs_info->tess.point_mode; +} + +static void +anv_pipeline_link_tcs(const struct brw_compiler *compiler, + struct anv_pipeline_stage *tcs_stage, + struct anv_pipeline_stage *tes_stage) +{ + assert(tes_stage && tes_stage->stage == MESA_SHADER_TESS_EVAL); + + brw_nir_link_shaders(compiler, tcs_stage->nir, tes_stage->nir); + + nir_lower_patch_vertices(tes_stage->nir, + tcs_stage->nir->info.tess.tcs_vertices_out, + NULL); + + /* Copy TCS info into the TES info */ + merge_tess_info(&tes_stage->nir->info, &tcs_stage->nir->info); + + /* Whacking the key after cache lookup is a bit sketchy, but all of + * this comes from the SPIR-V, which is part of the hash used for the + * pipeline cache. So it should be safe. + */ + tcs_stage->key.tcs._tes_primitive_mode = + tes_stage->nir->info.tess._primitive_mode; + tcs_stage->key.tcs.quads_workaround = + compiler->devinfo->ver < 9 && + tes_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_QUADS && + tes_stage->nir->info.tess.spacing == TESS_SPACING_EQUAL; +} + +static void +anv_pipeline_compile_tcs(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_device *device, + struct anv_pipeline_stage *tcs_stage, + struct anv_pipeline_stage *prev_stage) +{ + tcs_stage->key.tcs.outputs_written = + tcs_stage->nir->info.outputs_written; + tcs_stage->key.tcs.patch_outputs_written = + tcs_stage->nir->info.patch_outputs_written; + + tcs_stage->num_stats = 1; + + struct brw_compile_tcs_params params = { + .nir = tcs_stage->nir, + .key = &tcs_stage->key.tcs, + .prog_data = &tcs_stage->prog_data.tcs, + .stats = tcs_stage->stats, + .log_data = device, + }; + + tcs_stage->code = brw_compile_tcs(compiler, mem_ctx, ¶ms); +} + +static void +anv_pipeline_link_tes(const struct brw_compiler *compiler, + struct anv_pipeline_stage *tes_stage, + struct anv_pipeline_stage *next_stage) +{ + if (next_stage) + brw_nir_link_shaders(compiler, tes_stage->nir, next_stage->nir); +} + +static void +anv_pipeline_compile_tes(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_device *device, + struct anv_pipeline_stage *tes_stage, + struct anv_pipeline_stage *tcs_stage) +{ + tes_stage->key.tes.inputs_read = + tcs_stage->nir->info.outputs_written; + tes_stage->key.tes.patch_inputs_read = + tcs_stage->nir->info.patch_outputs_written; + + tes_stage->num_stats = 1; + + struct brw_compile_tes_params params = { + .nir = tes_stage->nir, + .key = &tes_stage->key.tes, + .prog_data = &tes_stage->prog_data.tes, + .input_vue_map = &tcs_stage->prog_data.tcs.base.vue_map, + .stats = tes_stage->stats, + .log_data = device, + }; + + tes_stage->code = brw_compile_tes(compiler, mem_ctx, ¶ms); +} + +static void +anv_pipeline_link_gs(const struct brw_compiler *compiler, + struct anv_pipeline_stage *gs_stage, + struct 
anv_pipeline_stage *next_stage) +{ + if (next_stage) + brw_nir_link_shaders(compiler, gs_stage->nir, next_stage->nir); +} + +static void +anv_pipeline_compile_gs(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_device *device, + struct anv_pipeline_stage *gs_stage, + struct anv_pipeline_stage *prev_stage) +{ + brw_compute_vue_map(compiler->devinfo, + &gs_stage->prog_data.gs.base.vue_map, + gs_stage->nir->info.outputs_written, + gs_stage->nir->info.separate_shader, 1); + + gs_stage->num_stats = 1; + + struct brw_compile_gs_params params = { + .nir = gs_stage->nir, + .key = &gs_stage->key.gs, + .prog_data = &gs_stage->prog_data.gs, + .stats = gs_stage->stats, + .log_data = device, + }; + + gs_stage->code = brw_compile_gs(compiler, mem_ctx, ¶ms); +} + +static void +anv_pipeline_link_task(const struct brw_compiler *compiler, + struct anv_pipeline_stage *task_stage, + struct anv_pipeline_stage *next_stage) +{ + assert(next_stage); + assert(next_stage->stage == MESA_SHADER_MESH); + brw_nir_link_shaders(compiler, task_stage->nir, next_stage->nir); +} + +static void +anv_pipeline_compile_task(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_device *device, + struct anv_pipeline_stage *task_stage) +{ + task_stage->num_stats = 1; + + struct brw_compile_task_params params = { + .nir = task_stage->nir, + .key = &task_stage->key.task, + .prog_data = &task_stage->prog_data.task, + .stats = task_stage->stats, + .log_data = device, + }; + + task_stage->code = brw_compile_task(compiler, mem_ctx, ¶ms); +} + +static void +anv_pipeline_link_mesh(const struct brw_compiler *compiler, + struct anv_pipeline_stage *mesh_stage, + struct anv_pipeline_stage *next_stage) +{ + if (next_stage) { + brw_nir_link_shaders(compiler, mesh_stage->nir, next_stage->nir); + } +} + +static void +anv_pipeline_compile_mesh(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_device *device, + struct anv_pipeline_stage *mesh_stage, + struct anv_pipeline_stage *prev_stage) +{ + mesh_stage->num_stats = 1; + + struct brw_compile_mesh_params params = { + .nir = mesh_stage->nir, + .key = &mesh_stage->key.mesh, + .prog_data = &mesh_stage->prog_data.mesh, + .stats = mesh_stage->stats, + .log_data = device, + }; + + if (prev_stage) { + assert(prev_stage->stage == MESA_SHADER_TASK); + params.tue_map = &prev_stage->prog_data.task.map; + } + + mesh_stage->code = brw_compile_mesh(compiler, mem_ctx, ¶ms); +} + +static void +anv_pipeline_link_fs(const struct brw_compiler *compiler, + struct anv_pipeline_stage *stage, + const struct vk_render_pass_state *rp) +{ + /* Initially the valid outputs value is set to all possible render targets + * valid (see populate_wm_prog_key()), before we look at the shader + * variables. Here we look at the output variables of the shader an compute + * a correct number of render target outputs. + */ + stage->key.wm.color_outputs_valid = 0; + nir_foreach_shader_out_variable_safe(var, stage->nir) { + if (var->data.location < FRAG_RESULT_DATA0) + continue; + + const unsigned rt = var->data.location - FRAG_RESULT_DATA0; + const unsigned array_len = + glsl_type_is_array(var->type) ? 
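/* an arrayed output claims one render target per array element */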
glsl_get_length(var->type) : 1; + assert(rt + array_len <= MAX_RTS); + + stage->key.wm.color_outputs_valid |= BITFIELD_RANGE(rt, array_len); + } + stage->key.wm.color_outputs_valid &= + (1u << rp->color_attachment_count) - 1; + stage->key.wm.nr_color_regions = + util_last_bit(stage->key.wm.color_outputs_valid); + + unsigned num_rt_bindings; + struct anv_pipeline_binding rt_bindings[MAX_RTS]; + if (stage->key.wm.nr_color_regions > 0) { + assert(stage->key.wm.nr_color_regions <= MAX_RTS); + for (unsigned rt = 0; rt < stage->key.wm.nr_color_regions; rt++) { + if (stage->key.wm.color_outputs_valid & BITFIELD_BIT(rt)) { + rt_bindings[rt] = (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, + .index = rt, + }; + } else { + /* Setup a null render target */ + rt_bindings[rt] = (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, + .index = UINT32_MAX, + }; + } + } + num_rt_bindings = stage->key.wm.nr_color_regions; + } else { + /* Setup a null render target */ + rt_bindings[0] = (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, + .index = UINT32_MAX, + }; + num_rt_bindings = 1; + } + + assert(num_rt_bindings <= MAX_RTS); + assert(stage->bind_map.surface_count == 0); + typed_memcpy(stage->bind_map.surface_to_descriptor, + rt_bindings, num_rt_bindings); + stage->bind_map.surface_count += num_rt_bindings; +} + +static void +anv_pipeline_compile_fs(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_device *device, + struct anv_pipeline_stage *fs_stage, + struct anv_pipeline_stage *prev_stage) +{ + /* TODO: we could set this to 0 based on the information in nir_shader, but + * we need this before we call spirv_to_nir. + */ + assert(prev_stage); + + struct brw_compile_fs_params params = { + .nir = fs_stage->nir, + .key = &fs_stage->key.wm, + .prog_data = &fs_stage->prog_data.wm, + + .allow_spilling = true, + .stats = fs_stage->stats, + .log_data = device, + }; + + if (prev_stage->stage == MESA_SHADER_MESH) { + params.mue_map = &prev_stage->prog_data.mesh.map; + /* TODO(mesh): Slots valid, do we even use/rely on it? 
*/ + } else { + fs_stage->key.wm.input_slots_valid = + prev_stage->prog_data.vue.vue_map.slots_valid; + } + + fs_stage->code = brw_compile_fs(compiler, mem_ctx, ¶ms); + + fs_stage->num_stats = (uint32_t)fs_stage->prog_data.wm.dispatch_8 + + (uint32_t)fs_stage->prog_data.wm.dispatch_16 + + (uint32_t)fs_stage->prog_data.wm.dispatch_32; +} + +static void +anv_pipeline_add_executable(struct anv_pipeline *pipeline, + struct anv_pipeline_stage *stage, + struct brw_compile_stats *stats, + uint32_t code_offset) +{ + char *nir = NULL; + if (stage->nir && + (pipeline->flags & + VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) { + nir = nir_shader_as_str(stage->nir, pipeline->mem_ctx); + } + + char *disasm = NULL; + if (stage->code && + (pipeline->flags & + VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) { + char *stream_data = NULL; + size_t stream_size = 0; + FILE *stream = open_memstream(&stream_data, &stream_size); + + uint32_t push_size = 0; + for (unsigned i = 0; i < 4; i++) + push_size += stage->bind_map.push_ranges[i].length; + if (push_size > 0) { + fprintf(stream, "Push constant ranges:\n"); + for (unsigned i = 0; i < 4; i++) { + if (stage->bind_map.push_ranges[i].length == 0) + continue; + + fprintf(stream, " RANGE%d (%dB): ", i, + stage->bind_map.push_ranges[i].length * 32); + + switch (stage->bind_map.push_ranges[i].set) { + case ANV_DESCRIPTOR_SET_NULL: + fprintf(stream, "NULL"); + break; + + case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: + fprintf(stream, "Vulkan push constants and API params"); + break; + + case ANV_DESCRIPTOR_SET_DESCRIPTORS: + fprintf(stream, "Descriptor buffer for set %d (start=%dB)", + stage->bind_map.push_ranges[i].index, + stage->bind_map.push_ranges[i].start * 32); + break; + + case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: + unreachable("gl_NumWorkgroups is never pushed"); + + case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: + fprintf(stream, "Inline shader constant data (start=%dB)", + stage->bind_map.push_ranges[i].start * 32); + break; + + case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS: + unreachable("Color attachments can't be pushed"); + + default: + fprintf(stream, "UBO (set=%d binding=%d start=%dB)", + stage->bind_map.push_ranges[i].set, + stage->bind_map.push_ranges[i].index, + stage->bind_map.push_ranges[i].start * 32); + break; + } + fprintf(stream, "\n"); + } + fprintf(stream, "\n"); + } + + /* Creating this is far cheaper than it looks. It's perfectly fine to + * do it for every binary. + */ + intel_disassemble(&pipeline->device->physical->compiler->isa, + stage->code, code_offset, stream); + + fclose(stream); + + /* Copy it to a ralloc'd thing */ + disasm = ralloc_size(pipeline->mem_ctx, stream_size + 1); + memcpy(disasm, stream_data, stream_size); + disasm[stream_size] = 0; + + free(stream_data); + } + + const struct anv_pipeline_executable exe = { + .stage = stage->stage, + .stats = *stats, + .nir = nir, + .disasm = disasm, + }; + util_dynarray_append(&pipeline->executables, + struct anv_pipeline_executable, exe); +} + +static void +anv_pipeline_add_executables(struct anv_pipeline *pipeline, + struct anv_pipeline_stage *stage, + struct anv_shader_bin *bin) +{ + if (stage->stage == MESA_SHADER_FRAGMENT) { + /* We pull the prog data and stats out of the anv_shader_bin because + * the anv_pipeline_stage may not be fully populated if we successfully + * looked up the shader in a cache. 
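+ * A fragment shader can produce up to three binaries (SIMD8, SIMD16 and
+ * SIMD32), so the dispatch_* flags below pick which stats entries and
+ * code offsets get reported as executables.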
+ */ + const struct brw_wm_prog_data *wm_prog_data = + (const struct brw_wm_prog_data *)bin->prog_data; + struct brw_compile_stats *stats = bin->stats; + + if (wm_prog_data->dispatch_8) { + anv_pipeline_add_executable(pipeline, stage, stats++, 0); + } + + if (wm_prog_data->dispatch_16) { + anv_pipeline_add_executable(pipeline, stage, stats++, + wm_prog_data->prog_offset_16); + } + + if (wm_prog_data->dispatch_32) { + anv_pipeline_add_executable(pipeline, stage, stats++, + wm_prog_data->prog_offset_32); + } + } else { + anv_pipeline_add_executable(pipeline, stage, bin->stats, 0); + } + + pipeline->ray_queries = MAX2(pipeline->ray_queries, bin->prog_data->ray_queries); +} + +static void +anv_graphics_pipeline_init_keys(struct anv_graphics_pipeline *pipeline, + const struct vk_graphics_pipeline_state *state, + struct anv_pipeline_stage *stages) +{ + for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) { + if (!stages[s].info) + continue; + + int64_t stage_start = os_time_get_nano(); + + vk_pipeline_hash_shader_stage(stages[s].info, stages[s].shader_sha1); + + const struct anv_device *device = pipeline->base.device; + switch (stages[s].stage) { + case MESA_SHADER_VERTEX: + populate_vs_prog_key(device, + pipeline->base.device->robust_buffer_access, + &stages[s].key.vs); + break; + case MESA_SHADER_TESS_CTRL: + populate_tcs_prog_key(device, + pipeline->base.device->robust_buffer_access, + state->ts->patch_control_points, + &stages[s].key.tcs); + break; + case MESA_SHADER_TESS_EVAL: + populate_tes_prog_key(device, + pipeline->base.device->robust_buffer_access, + &stages[s].key.tes); + break; + case MESA_SHADER_GEOMETRY: + populate_gs_prog_key(device, + pipeline->base.device->robust_buffer_access, + &stages[s].key.gs); + break; + case MESA_SHADER_FRAGMENT: { + populate_wm_prog_key(pipeline, + pipeline->base.device->robust_buffer_access, + state->dynamic, state->ms, state->fsr, state->rp, + &stages[s].key.wm); + break; + } + case MESA_SHADER_TASK: + populate_task_prog_key(device, + pipeline->base.device->robust_buffer_access, + &stages[s].key.task); + break; + case MESA_SHADER_MESH: + populate_mesh_prog_key(device, + pipeline->base.device->robust_buffer_access, + &stages[s].key.mesh); + break; + default: + unreachable("Invalid graphics shader stage"); + } + + stages[s].feedback.duration += os_time_get_nano() - stage_start; + stages[s].feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; + } + + assert(pipeline->active_stages & VK_SHADER_STAGE_VERTEX_BIT || + pipeline->active_stages & VK_SHADER_STAGE_MESH_BIT_NV); +} + +static bool +anv_graphics_pipeline_load_cached_shaders(struct anv_graphics_pipeline *pipeline, + struct vk_pipeline_cache *cache, + struct anv_pipeline_stage *stages, + VkPipelineCreationFeedbackEXT *pipeline_feedback) +{ + unsigned found = 0; + unsigned cache_hits = 0; + for (unsigned s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) { + if (!stages[s].info) + continue; + + int64_t stage_start = os_time_get_nano(); + + bool cache_hit; + struct anv_shader_bin *bin = + anv_device_search_for_kernel(pipeline->base.device, cache, + &stages[s].cache_key, + sizeof(stages[s].cache_key), &cache_hit); + if (bin) { + found++; + pipeline->shaders[s] = bin; + } + + if (cache_hit) { + cache_hits++; + stages[s].feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + } + stages[s].feedback.duration += os_time_get_nano() - stage_start; + } + + if (found == __builtin_popcount(pipeline->active_stages)) { + if (cache_hits == found) { + 
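/* 'found' counts every stage we got a binary for while 'cache_hits' only counts true cache hits, so the pipeline-level hit flag is set only when all stages actually hit the cache. */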
pipeline_feedback->flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + } + /* We found all our shaders in the cache. We're done. */ + for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) { + if (!stages[s].info) + continue; + + anv_pipeline_add_executables(&pipeline->base, &stages[s], + pipeline->shaders[s]); + } + return true; + } else if (found > 0) { + /* We found some but not all of our shaders. This shouldn't happen most + * of the time but it can if we have a partially populated pipeline + * cache. + */ + assert(found < __builtin_popcount(pipeline->active_stages)); + + vk_perf(VK_LOG_OBJS(cache ? &cache->base : + &pipeline->base.device->vk.base), + "Found a partial pipeline in the cache. This is " + "most likely caused by an incomplete pipeline cache " + "import or export"); + + /* We're going to have to recompile anyway, so just throw away our + * references to the shaders in the cache. We'll get them out of the + * cache again as part of the compilation process. + */ + for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) { + stages[s].feedback.flags = 0; + if (pipeline->shaders[s]) { + anv_shader_bin_unref(pipeline->base.device, pipeline->shaders[s]); + pipeline->shaders[s] = NULL; + } + } + } + + return false; +} + +static const gl_shader_stage graphics_shader_order[] = { + MESA_SHADER_VERTEX, + MESA_SHADER_TESS_CTRL, + MESA_SHADER_TESS_EVAL, + MESA_SHADER_GEOMETRY, + + MESA_SHADER_TASK, + MESA_SHADER_MESH, + + MESA_SHADER_FRAGMENT, +}; + +static VkResult +anv_graphics_pipeline_load_nir(struct anv_graphics_pipeline *pipeline, + struct vk_pipeline_cache *cache, + struct anv_pipeline_stage *stages, + void *pipeline_ctx) +{ + for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) { + gl_shader_stage s = graphics_shader_order[i]; + if (!stages[s].info) + continue; + + int64_t stage_start = os_time_get_nano(); + + assert(stages[s].stage == s); + assert(pipeline->shaders[s] == NULL); + + stages[s].bind_map = (struct anv_pipeline_bind_map) { + .surface_to_descriptor = stages[s].surface_to_descriptor, + .sampler_to_descriptor = stages[s].sampler_to_descriptor + }; + + stages[s].nir = anv_pipeline_stage_get_nir(&pipeline->base, cache, + pipeline_ctx, + &stages[s]); + if (stages[s].nir == NULL) { + return vk_error(pipeline, VK_ERROR_UNKNOWN); + } + + stages[s].feedback.duration += os_time_get_nano() - stage_start; + } + + return VK_SUCCESS; +} + +static VkResult +anv_graphics_pipeline_compile(struct anv_graphics_pipeline *pipeline, + struct vk_pipeline_cache *cache, + const VkGraphicsPipelineCreateInfo *info, + const struct vk_graphics_pipeline_state *state) +{ + ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout); + VkResult result; + + VkPipelineCreationFeedbackEXT pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, + }; + int64_t pipeline_start = os_time_get_nano(); + + const struct brw_compiler *compiler = pipeline->base.device->physical->compiler; + struct anv_pipeline_stage stages[ANV_GRAPHICS_SHADER_STAGE_COUNT] = {}; + for (uint32_t i = 0; i < info->stageCount; i++) { + gl_shader_stage stage = vk_to_mesa_shader_stage(info->pStages[i].stage); + stages[stage].stage = stage; + stages[stage].info = &info->pStages[i]; + } + + anv_graphics_pipeline_init_keys(pipeline, state, stages); + + unsigned char sha1[20]; + anv_pipeline_hash_graphics(pipeline, layout, stages, sha1); + + for (unsigned s = 0; s < ARRAY_SIZE(stages); s++) { + if (!stages[s].info) + continue; + + stages[s].cache_key.stage = s; + 
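/* The cache key is the stage plus the whole-pipeline hash computed above (view mask, layout, robustness and every stage's SPIR-V hash and key), so a cached binary is only reused when all of those match. */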
memcpy(stages[s].cache_key.sha1, sha1, sizeof(sha1)); + } + + const bool skip_cache_lookup = + (pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR); + if (!skip_cache_lookup) { + bool found_all_shaders = + anv_graphics_pipeline_load_cached_shaders(pipeline, cache, stages, + &pipeline_feedback); + if (found_all_shaders) + goto done; + } + + if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) + return VK_PIPELINE_COMPILE_REQUIRED; + + void *pipeline_ctx = ralloc_context(NULL); + + result = anv_graphics_pipeline_load_nir(pipeline, cache, stages, + pipeline_ctx); + if (result != VK_SUCCESS) + goto fail; + + /* Walk backwards to link */ + struct anv_pipeline_stage *next_stage = NULL; + for (int i = ARRAY_SIZE(graphics_shader_order) - 1; i >= 0; i--) { + gl_shader_stage s = graphics_shader_order[i]; + if (!stages[s].info) + continue; + + switch (s) { + case MESA_SHADER_VERTEX: + anv_pipeline_link_vs(compiler, &stages[s], next_stage); + break; + case MESA_SHADER_TESS_CTRL: + anv_pipeline_link_tcs(compiler, &stages[s], next_stage); + break; + case MESA_SHADER_TESS_EVAL: + anv_pipeline_link_tes(compiler, &stages[s], next_stage); + break; + case MESA_SHADER_GEOMETRY: + anv_pipeline_link_gs(compiler, &stages[s], next_stage); + break; + case MESA_SHADER_TASK: + anv_pipeline_link_task(compiler, &stages[s], next_stage); + break; + case MESA_SHADER_MESH: + anv_pipeline_link_mesh(compiler, &stages[s], next_stage); + break; + case MESA_SHADER_FRAGMENT: + anv_pipeline_link_fs(compiler, &stages[s], state->rp); + break; + default: + unreachable("Invalid graphics shader stage"); + } + + next_stage = &stages[s]; + } + + bool use_primitive_replication = false; + if (pipeline->base.device->info->ver >= 12 && + pipeline->view_mask != 0) { + /* For some pipelines HW Primitive Replication can be used instead of + * instancing to implement Multiview. This depend on how viewIndex is + * used in all the active shaders, so this check can't be done per + * individual shaders. 
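+ * Since hasvk only loads on gfx7/8 platforms, the ver >= 12 check above
+ * never passes here and multiview always falls back to instanced draws.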
+ */ + nir_shader *shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT] = {}; + for (unsigned s = 0; s < ARRAY_SIZE(shaders); s++) + shaders[s] = stages[s].nir; + + use_primitive_replication = + anv_check_for_primitive_replication(pipeline->base.device, + pipeline->active_stages, + shaders, pipeline->view_mask); + } + + struct anv_pipeline_stage *prev_stage = NULL; + for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) { + gl_shader_stage s = graphics_shader_order[i]; + if (!stages[s].info) + continue; + + int64_t stage_start = os_time_get_nano(); + + void *stage_ctx = ralloc_context(NULL); + + anv_pipeline_lower_nir(&pipeline->base, stage_ctx, &stages[s], layout, + use_primitive_replication); + + if (prev_stage && compiler->nir_options[s]->unify_interfaces) { + prev_stage->nir->info.outputs_written |= stages[s].nir->info.inputs_read & + ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER); + stages[s].nir->info.inputs_read |= prev_stage->nir->info.outputs_written & + ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER); + prev_stage->nir->info.patch_outputs_written |= stages[s].nir->info.patch_inputs_read; + stages[s].nir->info.patch_inputs_read |= prev_stage->nir->info.patch_outputs_written; + } + + ralloc_free(stage_ctx); + + stages[s].feedback.duration += os_time_get_nano() - stage_start; + + prev_stage = &stages[s]; + } + + /* In the case the platform can write the primitive variable shading rate, + * figure out the last geometry stage that should write the primitive + * shading rate, and ensure it is marked as used there. The backend will + * write a default value if the shader doesn't actually write it. + * + * We iterate backwards in the stage and stop on the first shader that can + * set the value. + */ + const struct intel_device_info *devinfo = pipeline->base.device->info; + if (devinfo->has_coarse_pixel_primitive_and_cb && + stages[MESA_SHADER_FRAGMENT].info && + stages[MESA_SHADER_FRAGMENT].key.wm.coarse_pixel && + !stages[MESA_SHADER_FRAGMENT].nir->info.fs.uses_sample_shading && + stages[MESA_SHADER_MESH].info == NULL) { + struct anv_pipeline_stage *last_psr = NULL; + + for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) { + gl_shader_stage s = + graphics_shader_order[ARRAY_SIZE(graphics_shader_order) - i - 1]; + + if (!stages[s].info || + !gl_shader_stage_can_set_fragment_shading_rate(s)) + continue; + + last_psr = &stages[s]; + break; + } + + assert(last_psr); + last_psr->nir->info.outputs_written |= VARYING_BIT_PRIMITIVE_SHADING_RATE; + } + + prev_stage = NULL; + for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) { + gl_shader_stage s = graphics_shader_order[i]; + if (!stages[s].info) + continue; + + int64_t stage_start = os_time_get_nano(); + + void *stage_ctx = ralloc_context(NULL); + + switch (s) { + case MESA_SHADER_VERTEX: + anv_pipeline_compile_vs(compiler, stage_ctx, pipeline, + &stages[s]); + break; + case MESA_SHADER_TESS_CTRL: + anv_pipeline_compile_tcs(compiler, stage_ctx, pipeline->base.device, + &stages[s], prev_stage); + break; + case MESA_SHADER_TESS_EVAL: + anv_pipeline_compile_tes(compiler, stage_ctx, pipeline->base.device, + &stages[s], prev_stage); + break; + case MESA_SHADER_GEOMETRY: + anv_pipeline_compile_gs(compiler, stage_ctx, pipeline->base.device, + &stages[s], prev_stage); + break; + case MESA_SHADER_TASK: + anv_pipeline_compile_task(compiler, stage_ctx, pipeline->base.device, + &stages[s]); + break; + case MESA_SHADER_MESH: + anv_pipeline_compile_mesh(compiler, stage_ctx, pipeline->base.device, + 
&stages[s], prev_stage); + break; + case MESA_SHADER_FRAGMENT: + anv_pipeline_compile_fs(compiler, stage_ctx, pipeline->base.device, + &stages[s], prev_stage); + break; + default: + unreachable("Invalid graphics shader stage"); + } + if (stages[s].code == NULL) { + ralloc_free(stage_ctx); + result = vk_error(pipeline->base.device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + anv_nir_validate_push_layout(&stages[s].prog_data.base, + &stages[s].bind_map); + + struct anv_shader_bin *bin = + anv_device_upload_kernel(pipeline->base.device, cache, s, + &stages[s].cache_key, + sizeof(stages[s].cache_key), + stages[s].code, + stages[s].prog_data.base.program_size, + &stages[s].prog_data.base, + brw_prog_data_size(s), + stages[s].stats, stages[s].num_stats, + stages[s].nir->xfb_info, + &stages[s].bind_map); + if (!bin) { + ralloc_free(stage_ctx); + result = vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + anv_pipeline_add_executables(&pipeline->base, &stages[s], bin); + + pipeline->shaders[s] = bin; + ralloc_free(stage_ctx); + + stages[s].feedback.duration += os_time_get_nano() - stage_start; + + prev_stage = &stages[s]; + } + + ralloc_free(pipeline_ctx); + +done: + + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; + + const VkPipelineCreationFeedbackCreateInfo *create_feedback = + vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); + if (create_feedback) { + *create_feedback->pPipelineCreationFeedback = pipeline_feedback; + + assert(info->stageCount == create_feedback->pipelineStageCreationFeedbackCount); + for (uint32_t i = 0; i < info->stageCount; i++) { + gl_shader_stage s = vk_to_mesa_shader_stage(info->pStages[i].stage); + create_feedback->pPipelineStageCreationFeedbacks[i] = stages[s].feedback; + } + } + + return VK_SUCCESS; + +fail: + ralloc_free(pipeline_ctx); + + for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) { + if (pipeline->shaders[s]) + anv_shader_bin_unref(pipeline->base.device, pipeline->shaders[s]); + } + + return result; +} + +static VkResult +anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline, + struct vk_pipeline_cache *cache, + const VkComputePipelineCreateInfo *info) +{ + const VkPipelineShaderStageCreateInfo *sinfo = &info->stage; + assert(sinfo->stage == VK_SHADER_STAGE_COMPUTE_BIT); + + VkPipelineCreationFeedback pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, + }; + int64_t pipeline_start = os_time_get_nano(); + + struct anv_device *device = pipeline->base.device; + const struct brw_compiler *compiler = device->physical->compiler; + + struct anv_pipeline_stage stage = { + .stage = MESA_SHADER_COMPUTE, + .info = &info->stage, + .cache_key = { + .stage = MESA_SHADER_COMPUTE, + }, + .feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, + }, + }; + vk_pipeline_hash_shader_stage(&info->stage, stage.shader_sha1); + + struct anv_shader_bin *bin = NULL; + + populate_cs_prog_key(device, device->robust_buffer_access, &stage.key.cs); + + ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout); + + const bool skip_cache_lookup = + (pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR); + + anv_pipeline_hash_compute(pipeline, layout, &stage, stage.cache_key.sha1); + + bool cache_hit = false; + if (!skip_cache_lookup) { + bin = anv_device_search_for_kernel(device, cache, + &stage.cache_key, + sizeof(stage.cache_key), + &cache_hit); + } + + if (bin == NULL && + (info->flags & 
VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT)) + return VK_PIPELINE_COMPILE_REQUIRED; + + void *mem_ctx = ralloc_context(NULL); + if (bin == NULL) { + int64_t stage_start = os_time_get_nano(); + + stage.bind_map = (struct anv_pipeline_bind_map) { + .surface_to_descriptor = stage.surface_to_descriptor, + .sampler_to_descriptor = stage.sampler_to_descriptor + }; + + /* Set up a binding for the gl_NumWorkGroups */ + stage.bind_map.surface_count = 1; + stage.bind_map.surface_to_descriptor[0] = (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS, + }; + + stage.nir = anv_pipeline_stage_get_nir(&pipeline->base, cache, mem_ctx, &stage); + if (stage.nir == NULL) { + ralloc_free(mem_ctx); + return vk_error(pipeline, VK_ERROR_UNKNOWN); + } + + NIR_PASS(_, stage.nir, anv_nir_add_base_work_group_id); + + anv_pipeline_lower_nir(&pipeline->base, mem_ctx, &stage, layout, + false /* use_primitive_replication */); + + unsigned local_size = stage.nir->info.workgroup_size[0] * + stage.nir->info.workgroup_size[1] * + stage.nir->info.workgroup_size[2]; + + /* Games don't always request full subgroups when they should, + * which can cause bugs, as they may expect bigger size of the + * subgroup than we choose for the execution. + */ + if (device->physical->instance->assume_full_subgroups && + stage.nir->info.cs.uses_wide_subgroup_intrinsics && + stage.nir->info.subgroup_size == SUBGROUP_SIZE_API_CONSTANT && + local_size && + local_size % BRW_SUBGROUP_SIZE == 0) + stage.nir->info.subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS; + + /* If the client requests that we dispatch full subgroups but doesn't + * allow us to pick a subgroup size, we have to smash it to the API + * value of 32. Performance will likely be terrible in this case but + * there's nothing we can do about that. The client should have chosen + * a size. 
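+ * In that case SUBGROUP_SIZE_FULL_SUBGROUPS is simply replaced with
+ * BRW_SUBGROUP_SIZE (32) just below.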
+ */ + if (stage.nir->info.subgroup_size == SUBGROUP_SIZE_FULL_SUBGROUPS) + stage.nir->info.subgroup_size = BRW_SUBGROUP_SIZE; + + stage.num_stats = 1; + + struct brw_compile_cs_params params = { + .nir = stage.nir, + .key = &stage.key.cs, + .prog_data = &stage.prog_data.cs, + .stats = stage.stats, + .log_data = device, + }; + + stage.code = brw_compile_cs(compiler, mem_ctx, ¶ms); + if (stage.code == NULL) { + ralloc_free(mem_ctx); + return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + anv_nir_validate_push_layout(&stage.prog_data.base, &stage.bind_map); + + if (!stage.prog_data.cs.uses_num_work_groups) { + assert(stage.bind_map.surface_to_descriptor[0].set == + ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS); + stage.bind_map.surface_to_descriptor[0].set = ANV_DESCRIPTOR_SET_NULL; + } + + const unsigned code_size = stage.prog_data.base.program_size; + bin = anv_device_upload_kernel(device, cache, + MESA_SHADER_COMPUTE, + &stage.cache_key, sizeof(stage.cache_key), + stage.code, code_size, + &stage.prog_data.base, + sizeof(stage.prog_data.cs), + stage.stats, stage.num_stats, + NULL, &stage.bind_map); + if (!bin) { + ralloc_free(mem_ctx); + return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + stage.feedback.duration = os_time_get_nano() - stage_start; + } + + anv_pipeline_add_executables(&pipeline->base, &stage, bin); + + ralloc_free(mem_ctx); + + if (cache_hit) { + stage.feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + pipeline_feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + } + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; + + const VkPipelineCreationFeedbackCreateInfo *create_feedback = + vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); + if (create_feedback) { + *create_feedback->pPipelineCreationFeedback = pipeline_feedback; + + assert(create_feedback->pipelineStageCreationFeedbackCount == 1); + create_feedback->pPipelineStageCreationFeedbacks[0] = stage.feedback; + } + + pipeline->cs = bin; + + return VK_SUCCESS; +} + +static VkResult +anv_compute_pipeline_create(struct anv_device *device, + struct vk_pipeline_cache *cache, + const VkComputePipelineCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkPipeline *pPipeline) +{ + struct anv_compute_pipeline *pipeline; + VkResult result; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO); + + pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pipeline == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = anv_pipeline_init(&pipeline->base, device, + ANV_PIPELINE_COMPUTE, pCreateInfo->flags, + pAllocator); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, pipeline); + return result; + } + + anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS, + pipeline->batch_data, sizeof(pipeline->batch_data)); + + result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo); + if (result != VK_SUCCESS) { + anv_pipeline_finish(&pipeline->base, device, pAllocator); + vk_free2(&device->vk.alloc, pAllocator, pipeline); + return result; + } + + anv_genX(device->info, compute_pipeline_emit)(pipeline); + + *pPipeline = anv_pipeline_to_handle(&pipeline->base); + + return pipeline->base.batch.status; +} + +VkResult anv_CreateComputePipelines( + VkDevice _device, + VkPipelineCache pipelineCache, + uint32_t count, + const VkComputePipelineCreateInfo* 
pCreateInfos, + const VkAllocationCallbacks* pAllocator, + VkPipeline* pPipelines) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(vk_pipeline_cache, pipeline_cache, pipelineCache); + + VkResult result = VK_SUCCESS; + + unsigned i; + for (i = 0; i < count; i++) { + VkResult res = anv_compute_pipeline_create(device, pipeline_cache, + &pCreateInfos[i], + pAllocator, &pPipelines[i]); + + if (res == VK_SUCCESS) + continue; + + /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED as it + * is not obvious what error should be report upon 2 different failures. + * */ + result = res; + if (res != VK_PIPELINE_COMPILE_REQUIRED) + break; + + pPipelines[i] = VK_NULL_HANDLE; + + if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) + break; + } + + for (; i < count; i++) + pPipelines[i] = VK_NULL_HANDLE; + + return result; +} + +/** + * Calculate the desired L3 partitioning based on the current state of the + * pipeline. For now this simply returns the conservative defaults calculated + * by get_default_l3_weights(), but we could probably do better by gathering + * more statistics from the pipeline state (e.g. guess of expected URB usage + * and bound surfaces), or by using feed-back from performance counters. + */ +void +anv_pipeline_setup_l3_config(struct anv_pipeline *pipeline, bool needs_slm) +{ + const struct intel_device_info *devinfo = pipeline->device->info; + + const struct intel_l3_weights w = + intel_get_default_l3_weights(devinfo, true, needs_slm); + + pipeline->l3_config = intel_get_l3_config(devinfo, w); +} + +static VkResult +anv_graphics_pipeline_init(struct anv_graphics_pipeline *pipeline, + struct anv_device *device, + struct vk_pipeline_cache *cache, + const struct VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct vk_graphics_pipeline_state *state, + const VkAllocationCallbacks *alloc) +{ + VkResult result; + + result = anv_pipeline_init(&pipeline->base, device, + ANV_PIPELINE_GRAPHICS, pCreateInfo->flags, + alloc); + if (result != VK_SUCCESS) + return result; + + anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS, + pipeline->batch_data, sizeof(pipeline->batch_data)); + + pipeline->active_stages = 0; + for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) + pipeline->active_stages |= pCreateInfo->pStages[i].stage; + + if (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) + pipeline->active_stages |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; + + if (anv_pipeline_is_mesh(pipeline)) + assert(device->physical->vk.supported_extensions.NV_mesh_shader); + + pipeline->dynamic_state.ms.sample_locations = &pipeline->sample_locations; + vk_dynamic_graphics_state_fill(&pipeline->dynamic_state, state); + + pipeline->depth_clamp_enable = state->rs->depth_clamp_enable; + pipeline->depth_clip_enable = state->rs->depth_clip_enable; + pipeline->view_mask = state->rp->view_mask; + + result = anv_graphics_pipeline_compile(pipeline, cache, pCreateInfo, state); + if (result != VK_SUCCESS) { + anv_pipeline_finish(&pipeline->base, device, alloc); + return result; + } + + anv_pipeline_setup_l3_config(&pipeline->base, false); + + if (anv_pipeline_is_primitive(pipeline)) { + const uint64_t inputs_read = get_vs_prog_data(pipeline)->inputs_read; + + u_foreach_bit(a, state->vi->attributes_valid) { + if (inputs_read & BITFIELD64_BIT(VERT_ATTRIB_GENERIC0 + a)) + pipeline->vb_used |= BITFIELD64_BIT(state->vi->attributes[a].binding); + } + + u_foreach_bit(b, state->vi->bindings_valid) { + pipeline->vb[b].stride = 
state->vi->bindings[b].stride; + pipeline->vb[b].instanced = state->vi->bindings[b].input_rate == + VK_VERTEX_INPUT_RATE_INSTANCE; + pipeline->vb[b].instance_divisor = state->vi->bindings[b].divisor; + } + + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views when primitive replication cannot be used. If the + * client asks for instancing, we need to multiply by the client's + * instance count at draw time and instance divisor in the vertex + * bindings by the number of views ensure that we repeat the client's + * per-instance data once for each view. + */ + const bool uses_primitive_replication = + anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots > 1; + pipeline->instance_multiplier = 1; + if (pipeline->view_mask && !uses_primitive_replication) + pipeline->instance_multiplier = util_bitcount(pipeline->view_mask); + } else { + assert(anv_pipeline_is_mesh(pipeline)); + /* TODO(mesh): Mesh vs. Multiview with Instancing. */ + } + + pipeline->negative_one_to_one = + state->vp != NULL && state->vp->negative_one_to_one; + + /* Store line mode, polygon mode and rasterization samples, these are used + * for dynamic primitive topology. + */ + pipeline->polygon_mode = state->rs->polygon_mode; + pipeline->rasterization_samples = + state->ms != NULL ? state->ms->rasterization_samples : 1; + pipeline->line_mode = state->rs->line.mode; + if (pipeline->line_mode == VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT) { + if (pipeline->rasterization_samples > 1) { + pipeline->line_mode = VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT; + } else { + pipeline->line_mode = VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT; + } + } + pipeline->patch_control_points = + state->ts != NULL ? state->ts->patch_control_points : 0; + + /* Store the color write masks, to be merged with color write enable if + * dynamic. 
+ */ + if (state->cb != NULL) { + for (unsigned i = 0; i < state->cb->attachment_count; i++) + pipeline->color_comp_writes[i] = state->cb->attachments[i].write_mask; + } + + return VK_SUCCESS; +} + +static VkResult +anv_graphics_pipeline_create(struct anv_device *device, + struct vk_pipeline_cache *cache, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkPipeline *pPipeline) +{ + struct anv_graphics_pipeline *pipeline; + VkResult result; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO); + + pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pipeline == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct vk_graphics_pipeline_all_state all; + struct vk_graphics_pipeline_state state = { }; + result = vk_graphics_pipeline_state_fill(&device->vk, &state, pCreateInfo, + NULL /* sp_info */, + &all, NULL, 0, NULL); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, pipeline); + return result; + } + + result = anv_graphics_pipeline_init(pipeline, device, cache, + pCreateInfo, &state, pAllocator); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, pipeline); + return result; + } + + anv_genX(device->info, graphics_pipeline_emit)(pipeline, &state); + + *pPipeline = anv_pipeline_to_handle(&pipeline->base); + + return pipeline->base.batch.status; +} + +VkResult anv_CreateGraphicsPipelines( + VkDevice _device, + VkPipelineCache pipelineCache, + uint32_t count, + const VkGraphicsPipelineCreateInfo* pCreateInfos, + const VkAllocationCallbacks* pAllocator, + VkPipeline* pPipelines) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(vk_pipeline_cache, pipeline_cache, pipelineCache); + + VkResult result = VK_SUCCESS; + + unsigned i; + for (i = 0; i < count; i++) { + VkResult res = anv_graphics_pipeline_create(device, + pipeline_cache, + &pCreateInfos[i], + pAllocator, &pPipelines[i]); + + if (res == VK_SUCCESS) + continue; + + /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED as it + * is not obvious what error should be report upon 2 different failures. 
+ * */ + result = res; + if (res != VK_PIPELINE_COMPILE_REQUIRED) + break; + + pPipelines[i] = VK_NULL_HANDLE; + + if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) + break; + } + + for (; i < count; i++) + pPipelines[i] = VK_NULL_HANDLE; + + return result; +} + +static VkResult +compile_upload_rt_shader(struct anv_ray_tracing_pipeline *pipeline, + struct vk_pipeline_cache *cache, + nir_shader *nir, + struct anv_pipeline_stage *stage, + struct anv_shader_bin **shader_out, + void *mem_ctx) +{ + const struct brw_compiler *compiler = + pipeline->base.device->physical->compiler; + const struct intel_device_info *devinfo = compiler->devinfo; + + nir_shader **resume_shaders = NULL; + uint32_t num_resume_shaders = 0; + if (nir->info.stage != MESA_SHADER_COMPUTE) { + NIR_PASS(_, nir, nir_lower_shader_calls, + nir_address_format_64bit_global, + BRW_BTD_STACK_ALIGN, + &resume_shaders, &num_resume_shaders, mem_ctx); + NIR_PASS(_, nir, brw_nir_lower_shader_calls); + NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo); + } + + for (unsigned i = 0; i < num_resume_shaders; i++) { + NIR_PASS(_,resume_shaders[i], brw_nir_lower_shader_calls); + NIR_PASS_V(resume_shaders[i], brw_nir_lower_rt_intrinsics, devinfo); + } + + struct brw_compile_bs_params params = { + .nir = nir, + .key = &stage->key.bs, + .prog_data = &stage->prog_data.bs, + .num_resume_shaders = num_resume_shaders, + .resume_shaders = resume_shaders, + + .stats = stage->stats, + .log_data = pipeline->base.device, + }; + + stage->code = brw_compile_bs(compiler, mem_ctx, ¶ms); + if (stage->code == NULL) + return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* Ray-tracing shaders don't have a "real" bind map */ + struct anv_pipeline_bind_map empty_bind_map = {}; + + const unsigned code_size = stage->prog_data.base.program_size; + struct anv_shader_bin *bin = + anv_device_upload_kernel(pipeline->base.device, + cache, + stage->stage, + &stage->cache_key, sizeof(stage->cache_key), + stage->code, code_size, + &stage->prog_data.base, + sizeof(stage->prog_data.bs), + stage->stats, 1, + NULL, &empty_bind_map); + if (bin == NULL) + return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* TODO: Figure out executables for resume shaders */ + anv_pipeline_add_executables(&pipeline->base, stage, bin); + util_dynarray_append(&pipeline->shaders, struct anv_shader_bin *, bin); + + *shader_out = bin; + + return VK_SUCCESS; +} + +static bool +is_rt_stack_size_dynamic(const VkRayTracingPipelineCreateInfoKHR *info) +{ + if (info->pDynamicState == NULL) + return false; + + for (unsigned i = 0; i < info->pDynamicState->dynamicStateCount; i++) { + if (info->pDynamicState->pDynamicStates[i] == + VK_DYNAMIC_STATE_RAY_TRACING_PIPELINE_STACK_SIZE_KHR) + return true; + } + + return false; +} + +static void +anv_pipeline_compute_ray_tracing_stacks(struct anv_ray_tracing_pipeline *pipeline, + const VkRayTracingPipelineCreateInfoKHR *info, + uint32_t *stack_max) +{ + if (is_rt_stack_size_dynamic(info)) { + pipeline->stack_size = 0; /* 0 means dynamic */ + } else { + /* From the Vulkan spec: + * + * "If the stack size is not set explicitly, the stack size for a + * pipeline is: + * + * rayGenStackMax + + * min(1, maxPipelineRayRecursionDepth) × + * max(closestHitStackMax, missStackMax, + * intersectionStackMax + anyHitStackMax) + + * max(0, maxPipelineRayRecursionDepth-1) × + * max(closestHitStackMax, missStackMax) + + * 2 × callableStackMax" + */ + pipeline->stack_size = + stack_max[MESA_SHADER_RAYGEN] + + MIN2(1, 
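/* this whole term drops out when the recursion depth is 0 */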
info->maxPipelineRayRecursionDepth) * + MAX4(stack_max[MESA_SHADER_CLOSEST_HIT], + stack_max[MESA_SHADER_MISS], + stack_max[MESA_SHADER_INTERSECTION], + stack_max[MESA_SHADER_ANY_HIT]) + + MAX2(0, (int)info->maxPipelineRayRecursionDepth - 1) * + MAX2(stack_max[MESA_SHADER_CLOSEST_HIT], + stack_max[MESA_SHADER_MISS]) + + 2 * stack_max[MESA_SHADER_CALLABLE]; + + /* This is an extremely unlikely case but we need to set it to some + * non-zero value so that we don't accidentally think it's dynamic. + * Our minimum stack size is 2KB anyway so we could set to any small + * value we like. + */ + if (pipeline->stack_size == 0) + pipeline->stack_size = 1; + } +} + +static struct anv_pipeline_stage * +anv_pipeline_init_ray_tracing_stages(struct anv_ray_tracing_pipeline *pipeline, + const VkRayTracingPipelineCreateInfoKHR *info, + void *pipeline_ctx) +{ + ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout); + + /* Create enough stage entries for all shader modules plus potential + * combinaisons in the groups. + */ + struct anv_pipeline_stage *stages = + rzalloc_array(pipeline_ctx, struct anv_pipeline_stage, info->stageCount); + + for (uint32_t i = 0; i < info->stageCount; i++) { + const VkPipelineShaderStageCreateInfo *sinfo = &info->pStages[i]; + if (vk_pipeline_shader_stage_is_null(sinfo)) + continue; + + int64_t stage_start = os_time_get_nano(); + + stages[i] = (struct anv_pipeline_stage) { + .stage = vk_to_mesa_shader_stage(sinfo->stage), + .info = sinfo, + .cache_key = { + .stage = vk_to_mesa_shader_stage(sinfo->stage), + }, + .feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, + }, + }; + + populate_bs_prog_key(pipeline->base.device, + pipeline->base.device->robust_buffer_access, + &stages[i].key.bs); + + vk_pipeline_hash_shader_stage(sinfo, stages[i].shader_sha1); + + if (stages[i].stage != MESA_SHADER_INTERSECTION) { + anv_pipeline_hash_ray_tracing_shader(pipeline, layout, &stages[i], + stages[i].cache_key.sha1); + } + + stages[i].feedback.duration += os_time_get_nano() - stage_start; + } + + for (uint32_t i = 0; i < info->groupCount; i++) { + const VkRayTracingShaderGroupCreateInfoKHR *ginfo = &info->pGroups[i]; + + if (ginfo->type != VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR) + continue; + + int64_t stage_start = os_time_get_nano(); + + uint32_t intersection_idx = ginfo->intersectionShader; + assert(intersection_idx < info->stageCount); + + uint32_t any_hit_idx = ginfo->anyHitShader; + if (any_hit_idx != VK_SHADER_UNUSED_KHR) { + assert(any_hit_idx < info->stageCount); + anv_pipeline_hash_ray_tracing_combined_shader(pipeline, + layout, + &stages[intersection_idx], + &stages[any_hit_idx], + stages[intersection_idx].cache_key.sha1); + } else { + anv_pipeline_hash_ray_tracing_shader(pipeline, layout, + &stages[intersection_idx], + stages[intersection_idx].cache_key.sha1); + } + + stages[intersection_idx].feedback.duration += os_time_get_nano() - stage_start; + } + + return stages; +} + +static bool +anv_pipeline_load_cached_shaders(struct anv_ray_tracing_pipeline *pipeline, + struct vk_pipeline_cache *cache, + const VkRayTracingPipelineCreateInfoKHR *info, + struct anv_pipeline_stage *stages, + uint32_t *stack_max) +{ + uint32_t shaders = 0, cache_hits = 0; + for (uint32_t i = 0; i < info->stageCount; i++) { + if (stages[i].info == NULL) + continue; + + shaders++; + + int64_t stage_start = os_time_get_nano(); + + bool cache_hit; + stages[i].bin = anv_device_search_for_kernel(pipeline->base.device, cache, + &stages[i].cache_key, + 
sizeof(stages[i].cache_key), + &cache_hit); + if (cache_hit) { + cache_hits++; + stages[i].feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + } + + if (stages[i].bin != NULL) { + anv_pipeline_add_executables(&pipeline->base, &stages[i], stages[i].bin); + util_dynarray_append(&pipeline->shaders, struct anv_shader_bin *, stages[i].bin); + + uint32_t stack_size = + brw_bs_prog_data_const(stages[i].bin->prog_data)->max_stack_size; + stack_max[stages[i].stage] = + MAX2(stack_max[stages[i].stage], stack_size); + } + + stages[i].feedback.duration += os_time_get_nano() - stage_start; + } + + return cache_hits == shaders; +} + +static VkResult +anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline, + struct vk_pipeline_cache *cache, + const VkRayTracingPipelineCreateInfoKHR *info) +{ + const struct intel_device_info *devinfo = pipeline->base.device->info; + VkResult result; + + VkPipelineCreationFeedback pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, + }; + int64_t pipeline_start = os_time_get_nano(); + + void *pipeline_ctx = ralloc_context(NULL); + + struct anv_pipeline_stage *stages = + anv_pipeline_init_ray_tracing_stages(pipeline, info, pipeline_ctx); + + ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout); + + const bool skip_cache_lookup = + (pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR); + + uint32_t stack_max[MESA_VULKAN_SHADER_STAGES] = {}; + + if (!skip_cache_lookup && + anv_pipeline_load_cached_shaders(pipeline, cache, info, stages, stack_max)) { + pipeline_feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + goto done; + } + + if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) { + ralloc_free(pipeline_ctx); + return VK_PIPELINE_COMPILE_REQUIRED; + } + + for (uint32_t i = 0; i < info->stageCount; i++) { + if (stages[i].info == NULL) + continue; + + int64_t stage_start = os_time_get_nano(); + + stages[i].nir = anv_pipeline_stage_get_nir(&pipeline->base, cache, + pipeline_ctx, &stages[i]); + if (stages[i].nir == NULL) { + ralloc_free(pipeline_ctx); + return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + anv_pipeline_lower_nir(&pipeline->base, pipeline_ctx, &stages[i], + layout, false /* use_primitive_replication */); + + stages[i].feedback.duration += os_time_get_nano() - stage_start; + } + + for (uint32_t i = 0; i < info->stageCount; i++) { + if (stages[i].info == NULL) + continue; + + /* Shader found in cache already. 
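+       * Reuse the cached binary and skip compiling this stage below.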
*/ + if (stages[i].bin != NULL) + continue; + + /* We handle intersection shaders as part of the group */ + if (stages[i].stage == MESA_SHADER_INTERSECTION) + continue; + + int64_t stage_start = os_time_get_nano(); + + void *stage_ctx = ralloc_context(pipeline_ctx); + + nir_shader *nir = nir_shader_clone(stage_ctx, stages[i].nir); + switch (stages[i].stage) { + case MESA_SHADER_RAYGEN: + brw_nir_lower_raygen(nir); + break; + + case MESA_SHADER_ANY_HIT: + brw_nir_lower_any_hit(nir, devinfo); + break; + + case MESA_SHADER_CLOSEST_HIT: + brw_nir_lower_closest_hit(nir); + break; + + case MESA_SHADER_MISS: + brw_nir_lower_miss(nir); + break; + + case MESA_SHADER_INTERSECTION: + unreachable("These are handled later"); + + case MESA_SHADER_CALLABLE: + brw_nir_lower_callable(nir); + break; + + default: + unreachable("Invalid ray-tracing shader stage"); + } + + result = compile_upload_rt_shader(pipeline, cache, nir, &stages[i], + &stages[i].bin, stage_ctx); + if (result != VK_SUCCESS) { + ralloc_free(pipeline_ctx); + return result; + } + + uint32_t stack_size = + brw_bs_prog_data_const(stages[i].bin->prog_data)->max_stack_size; + stack_max[stages[i].stage] = MAX2(stack_max[stages[i].stage], stack_size); + + ralloc_free(stage_ctx); + + stages[i].feedback.duration += os_time_get_nano() - stage_start; + } + + for (uint32_t i = 0; i < info->groupCount; i++) { + const VkRayTracingShaderGroupCreateInfoKHR *ginfo = &info->pGroups[i]; + struct anv_rt_shader_group *group = &pipeline->groups[i]; + group->type = ginfo->type; + switch (ginfo->type) { + case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: + assert(ginfo->generalShader < info->stageCount); + group->general = stages[ginfo->generalShader].bin; + break; + + case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: + if (ginfo->anyHitShader < info->stageCount) + group->any_hit = stages[ginfo->anyHitShader].bin; + + if (ginfo->closestHitShader < info->stageCount) + group->closest_hit = stages[ginfo->closestHitShader].bin; + break; + + case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: { + if (ginfo->closestHitShader < info->stageCount) + group->closest_hit = stages[ginfo->closestHitShader].bin; + + uint32_t intersection_idx = info->pGroups[i].intersectionShader; + assert(intersection_idx < info->stageCount); + + /* Only compile this stage if not already found in the cache. 
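+       * The intersection shader is compiled here, as part of its group,
+       * because it may have to be fused with the group's any-hit shader.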
*/ + if (stages[intersection_idx].bin == NULL) { + /* The any-hit and intersection shader have to be combined */ + uint32_t any_hit_idx = info->pGroups[i].anyHitShader; + const nir_shader *any_hit = NULL; + if (any_hit_idx < info->stageCount) + any_hit = stages[any_hit_idx].nir; + + void *group_ctx = ralloc_context(pipeline_ctx); + nir_shader *intersection = + nir_shader_clone(group_ctx, stages[intersection_idx].nir); + + brw_nir_lower_combined_intersection_any_hit(intersection, any_hit, + devinfo); + + result = compile_upload_rt_shader(pipeline, cache, + intersection, + &stages[intersection_idx], + &group->intersection, + group_ctx); + ralloc_free(group_ctx); + if (result != VK_SUCCESS) + return result; + } else { + group->intersection = stages[intersection_idx].bin; + } + + uint32_t stack_size = + brw_bs_prog_data_const(group->intersection->prog_data)->max_stack_size; + stack_max[MESA_SHADER_INTERSECTION] = + MAX2(stack_max[MESA_SHADER_INTERSECTION], stack_size); + + break; + } + + default: + unreachable("Invalid ray tracing shader group type"); + } + } + + done: + ralloc_free(pipeline_ctx); + + anv_pipeline_compute_ray_tracing_stacks(pipeline, info, stack_max); + + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; + + const VkPipelineCreationFeedbackCreateInfo *create_feedback = + vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); + if (create_feedback) { + *create_feedback->pPipelineCreationFeedback = pipeline_feedback; + + assert(info->stageCount == create_feedback->pipelineStageCreationFeedbackCount); + for (uint32_t i = 0; i < info->stageCount; i++) { + gl_shader_stage s = vk_to_mesa_shader_stage(info->pStages[i].stage); + create_feedback->pPipelineStageCreationFeedbacks[i] = stages[s].feedback; + } + } + + return VK_SUCCESS; +} + +VkResult +anv_device_init_rt_shaders(struct anv_device *device) +{ + if (!device->vk.enabled_extensions.KHR_ray_tracing_pipeline) + return VK_SUCCESS; + + bool cache_hit; + + struct brw_rt_trampoline { + char name[16]; + struct brw_cs_prog_key key; + } trampoline_key = { + .name = "rt-trampoline", + }; + device->rt_trampoline = + anv_device_search_for_kernel(device, device->internal_cache, + &trampoline_key, sizeof(trampoline_key), + &cache_hit); + if (device->rt_trampoline == NULL) { + + void *tmp_ctx = ralloc_context(NULL); + nir_shader *trampoline_nir = + brw_nir_create_raygen_trampoline(device->physical->compiler, tmp_ctx); + + trampoline_nir->info.subgroup_size = SUBGROUP_SIZE_REQUIRE_8; + + struct anv_pipeline_bind_map bind_map = { + .surface_count = 0, + .sampler_count = 0, + }; + uint32_t dummy_params[4] = { 0, }; + struct brw_cs_prog_data trampoline_prog_data = { + .base.nr_params = 4, + .base.param = dummy_params, + .uses_inline_data = true, + .uses_btd_stack_ids = true, + }; + struct brw_compile_cs_params params = { + .nir = trampoline_nir, + .key = &trampoline_key.key, + .prog_data = &trampoline_prog_data, + .log_data = device, + }; + const unsigned *tramp_data = + brw_compile_cs(device->physical->compiler, tmp_ctx, ¶ms); + + device->rt_trampoline = + anv_device_upload_kernel(device, device->internal_cache, + MESA_SHADER_COMPUTE, + &trampoline_key, sizeof(trampoline_key), + tramp_data, + trampoline_prog_data.base.program_size, + &trampoline_prog_data.base, + sizeof(trampoline_prog_data), + NULL, 0, NULL, &bind_map); + + ralloc_free(tmp_ctx); + + if (device->rt_trampoline == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + /* The cache already has a reference and it's not going 
anywhere so there + * is no need to hold a second reference. + */ + anv_shader_bin_unref(device, device->rt_trampoline); + + struct brw_rt_trivial_return { + char name[16]; + struct brw_bs_prog_key key; + } return_key = { + .name = "rt-trivial-ret", + }; + device->rt_trivial_return = + anv_device_search_for_kernel(device, device->internal_cache, + &return_key, sizeof(return_key), + &cache_hit); + if (device->rt_trivial_return == NULL) { + void *tmp_ctx = ralloc_context(NULL); + nir_shader *trivial_return_nir = + brw_nir_create_trivial_return_shader(device->physical->compiler, tmp_ctx); + + NIR_PASS_V(trivial_return_nir, brw_nir_lower_rt_intrinsics, device->info); + + struct anv_pipeline_bind_map bind_map = { + .surface_count = 0, + .sampler_count = 0, + }; + struct brw_bs_prog_data return_prog_data = { 0, }; + struct brw_compile_bs_params params = { + .nir = trivial_return_nir, + .key = &return_key.key, + .prog_data = &return_prog_data, + + .log_data = device, + }; + const unsigned *return_data = + brw_compile_bs(device->physical->compiler, tmp_ctx, ¶ms); + + device->rt_trivial_return = + anv_device_upload_kernel(device, device->internal_cache, + MESA_SHADER_CALLABLE, + &return_key, sizeof(return_key), + return_data, return_prog_data.base.program_size, + &return_prog_data.base, sizeof(return_prog_data), + NULL, 0, NULL, &bind_map); + + ralloc_free(tmp_ctx); + + if (device->rt_trivial_return == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + /* The cache already has a reference and it's not going anywhere so there + * is no need to hold a second reference. + */ + anv_shader_bin_unref(device, device->rt_trivial_return); + + return VK_SUCCESS; +} + +void +anv_device_finish_rt_shaders(struct anv_device *device) +{ + if (!device->vk.enabled_extensions.KHR_ray_tracing_pipeline) + return; +} + +static VkResult +anv_ray_tracing_pipeline_init(struct anv_ray_tracing_pipeline *pipeline, + struct anv_device *device, + struct vk_pipeline_cache *cache, + const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks *alloc) +{ + VkResult result; + + util_dynarray_init(&pipeline->shaders, pipeline->base.mem_ctx); + + result = anv_pipeline_compile_ray_tracing(pipeline, cache, pCreateInfo); + if (result != VK_SUCCESS) + goto fail; + + anv_pipeline_setup_l3_config(&pipeline->base, /* needs_slm */ false); + + return VK_SUCCESS; + +fail: + util_dynarray_foreach(&pipeline->shaders, + struct anv_shader_bin *, shader) { + anv_shader_bin_unref(device, *shader); + } + return result; +} + +static void +assert_rt_stage_index_valid(const VkRayTracingPipelineCreateInfoKHR* pCreateInfo, + uint32_t stage_idx, + VkShaderStageFlags valid_stages) +{ + if (stage_idx == VK_SHADER_UNUSED_KHR) + return; + + assert(stage_idx <= pCreateInfo->stageCount); + assert(util_bitcount(pCreateInfo->pStages[stage_idx].stage) == 1); + assert(pCreateInfo->pStages[stage_idx].stage & valid_stages); +} + +static VkResult +anv_ray_tracing_pipeline_create( + VkDevice _device, + struct vk_pipeline_cache * cache, + const VkRayTracingPipelineCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkPipeline* pPipeline) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + VkResult result; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR); + + VK_MULTIALLOC(ma); + VK_MULTIALLOC_DECL(&ma, struct anv_ray_tracing_pipeline, pipeline, 1); + VK_MULTIALLOC_DECL(&ma, struct anv_rt_shader_group, groups, pCreateInfo->groupCount); + if (!vk_multialloc_zalloc2(&ma, 
&device->vk.alloc, pAllocator, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE)) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = anv_pipeline_init(&pipeline->base, device, + ANV_PIPELINE_RAY_TRACING, pCreateInfo->flags, + pAllocator); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, pipeline); + return result; + } + + pipeline->group_count = pCreateInfo->groupCount; + pipeline->groups = groups; + + ASSERTED const VkShaderStageFlags ray_tracing_stages = + VK_SHADER_STAGE_RAYGEN_BIT_KHR | + VK_SHADER_STAGE_ANY_HIT_BIT_KHR | + VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | + VK_SHADER_STAGE_MISS_BIT_KHR | + VK_SHADER_STAGE_INTERSECTION_BIT_KHR | + VK_SHADER_STAGE_CALLABLE_BIT_KHR; + + for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) + assert((pCreateInfo->pStages[i].stage & ~ray_tracing_stages) == 0); + + for (uint32_t i = 0; i < pCreateInfo->groupCount; i++) { + const VkRayTracingShaderGroupCreateInfoKHR *ginfo = + &pCreateInfo->pGroups[i]; + assert_rt_stage_index_valid(pCreateInfo, ginfo->generalShader, + VK_SHADER_STAGE_RAYGEN_BIT_KHR | + VK_SHADER_STAGE_MISS_BIT_KHR | + VK_SHADER_STAGE_CALLABLE_BIT_KHR); + assert_rt_stage_index_valid(pCreateInfo, ginfo->closestHitShader, + VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR); + assert_rt_stage_index_valid(pCreateInfo, ginfo->anyHitShader, + VK_SHADER_STAGE_ANY_HIT_BIT_KHR); + assert_rt_stage_index_valid(pCreateInfo, ginfo->intersectionShader, + VK_SHADER_STAGE_INTERSECTION_BIT_KHR); + switch (ginfo->type) { + case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: + assert(ginfo->generalShader < pCreateInfo->stageCount); + assert(ginfo->anyHitShader == VK_SHADER_UNUSED_KHR); + assert(ginfo->closestHitShader == VK_SHADER_UNUSED_KHR); + assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR); + break; + + case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: + assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR); + assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR); + break; + + case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: + assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR); + break; + + default: + unreachable("Invalid ray-tracing shader group type"); + } + } + + result = anv_ray_tracing_pipeline_init(pipeline, device, cache, + pCreateInfo, pAllocator); + if (result != VK_SUCCESS) { + anv_pipeline_finish(&pipeline->base, device, pAllocator); + vk_free2(&device->vk.alloc, pAllocator, pipeline); + return result; + } + + anv_genX(device->info, ray_tracing_pipeline_emit)(pipeline); + + *pPipeline = anv_pipeline_to_handle(&pipeline->base); + + return pipeline->base.batch.status; +} + +VkResult +anv_CreateRayTracingPipelinesKHR( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + VkPipelineCache pipelineCache, + uint32_t createInfoCount, + const VkRayTracingPipelineCreateInfoKHR* pCreateInfos, + const VkAllocationCallbacks* pAllocator, + VkPipeline* pPipelines) +{ + ANV_FROM_HANDLE(vk_pipeline_cache, pipeline_cache, pipelineCache); + + VkResult result = VK_SUCCESS; + + unsigned i; + for (i = 0; i < createInfoCount; i++) { + VkResult res = anv_ray_tracing_pipeline_create(_device, pipeline_cache, + &pCreateInfos[i], + pAllocator, &pPipelines[i]); + + if (res == VK_SUCCESS) + continue; + + /* Bail out on the first error as it is not obvious what error should be + * report upon 2 different failures. 
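+       * VK_PIPELINE_COMPILE_REQUIRED is the only exception: in that case we
+       * keep creating the remaining pipelines, unless the application also
+       * set VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT.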
*/ + result = res; + if (result != VK_PIPELINE_COMPILE_REQUIRED) + break; + + pPipelines[i] = VK_NULL_HANDLE; + + if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) + break; + } + + for (; i < createInfoCount; i++) + pPipelines[i] = VK_NULL_HANDLE; + + return result; +} + +#define WRITE_STR(field, ...) ({ \ + memset(field, 0, sizeof(field)); \ + UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \ + assert(i > 0 && i < sizeof(field)); \ +}) + +VkResult anv_GetPipelineExecutablePropertiesKHR( + VkDevice device, + const VkPipelineInfoKHR* pPipelineInfo, + uint32_t* pExecutableCount, + VkPipelineExecutablePropertiesKHR* pProperties) +{ + ANV_FROM_HANDLE(anv_pipeline, pipeline, pPipelineInfo->pipeline); + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, + pProperties, pExecutableCount); + + util_dynarray_foreach (&pipeline->executables, struct anv_pipeline_executable, exe) { + vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) { + gl_shader_stage stage = exe->stage; + props->stages = mesa_to_vk_shader_stage(stage); + + unsigned simd_width = exe->stats.dispatch_width; + if (stage == MESA_SHADER_FRAGMENT) { + WRITE_STR(props->name, "%s%d %s", + simd_width ? "SIMD" : "vec", + simd_width ? simd_width : 4, + _mesa_shader_stage_to_string(stage)); + } else { + WRITE_STR(props->name, "%s", _mesa_shader_stage_to_string(stage)); + } + WRITE_STR(props->description, "%s%d %s shader", + simd_width ? "SIMD" : "vec", + simd_width ? simd_width : 4, + _mesa_shader_stage_to_string(stage)); + + /* The compiler gives us a dispatch width of 0 for vec4 but Vulkan + * wants a subgroup size of 1. + */ + props->subgroupSize = MAX2(simd_width, 1); + } + } + + return vk_outarray_status(&out); +} + +static const struct anv_pipeline_executable * +anv_pipeline_get_executable(struct anv_pipeline *pipeline, uint32_t index) +{ + assert(index < util_dynarray_num_elements(&pipeline->executables, + struct anv_pipeline_executable)); + return util_dynarray_element( + &pipeline->executables, struct anv_pipeline_executable, index); +} + +VkResult anv_GetPipelineExecutableStatisticsKHR( + VkDevice device, + const VkPipelineExecutableInfoKHR* pExecutableInfo, + uint32_t* pStatisticCount, + VkPipelineExecutableStatisticKHR* pStatistics) +{ + ANV_FROM_HANDLE(anv_pipeline, pipeline, pExecutableInfo->pipeline); + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, + pStatistics, pStatisticCount); + + const struct anv_pipeline_executable *exe = + anv_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); + + const struct brw_stage_prog_data *prog_data; + switch (pipeline->type) { + case ANV_PIPELINE_GRAPHICS: { + prog_data = anv_pipeline_to_graphics(pipeline)->shaders[exe->stage]->prog_data; + break; + } + case ANV_PIPELINE_COMPUTE: { + prog_data = anv_pipeline_to_compute(pipeline)->cs->prog_data; + break; + } + case ANV_PIPELINE_RAY_TRACING: { + struct anv_shader_bin **shader = + util_dynarray_element(&anv_pipeline_to_ray_tracing(pipeline)->shaders, + struct anv_shader_bin *, + pExecutableInfo->executableIndex); + prog_data = (*shader)->prog_data; + break; + } + default: + unreachable("invalid pipeline type"); + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Instruction Count"); + WRITE_STR(stat->description, + "Number of GEN instructions in the final generated " + "shader executable."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = 
exe->stats.instructions; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "SEND Count"); + WRITE_STR(stat->description, + "Number of instructions in the final generated shader " + "executable which access external units such as the " + "constant cache or the sampler."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.sends; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Loop Count"); + WRITE_STR(stat->description, + "Number of loops (not unrolled) in the final generated " + "shader executable."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.loops; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Cycle Count"); + WRITE_STR(stat->description, + "Estimate of the number of EU cycles required to execute " + "the final generated executable. This is an estimate only " + "and may vary greatly from actual run-time performance."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.cycles; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Spill Count"); + WRITE_STR(stat->description, + "Number of scratch spill operations. This gives a rough " + "estimate of the cost incurred due to spilling temporary " + "values to memory. If this is non-zero, you may want to " + "adjust your shader to reduce register pressure."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.spills; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Fill Count"); + WRITE_STR(stat->description, + "Number of scratch fill operations. This gives a rough " + "estimate of the cost incurred due to spilling temporary " + "values to memory. If this is non-zero, you may want to " + "adjust your shader to reduce register pressure."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.fills; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Scratch Memory Size"); + WRITE_STR(stat->description, + "Number of bytes of scratch memory required by the " + "generated shader executable. 
If this is non-zero, you " + "may want to adjust your shader to reduce register " + "pressure."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->total_scratch; + } + + if (gl_shader_stage_uses_workgroup(exe->stage)) { + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Workgroup Memory Size"); + WRITE_STR(stat->description, + "Number of bytes of workgroup shared memory used by this " + "shader including any padding."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->total_shared; + } + } + + return vk_outarray_status(&out); +} + +static bool +write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir, + const char *data) +{ + ir->isText = VK_TRUE; + + size_t data_len = strlen(data) + 1; + + if (ir->pData == NULL) { + ir->dataSize = data_len; + return true; + } + + strncpy(ir->pData, data, ir->dataSize); + if (ir->dataSize < data_len) + return false; + + ir->dataSize = data_len; + return true; +} + +VkResult anv_GetPipelineExecutableInternalRepresentationsKHR( + VkDevice device, + const VkPipelineExecutableInfoKHR* pExecutableInfo, + uint32_t* pInternalRepresentationCount, + VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations) +{ + ANV_FROM_HANDLE(anv_pipeline, pipeline, pExecutableInfo->pipeline); + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out, + pInternalRepresentations, pInternalRepresentationCount); + bool incomplete_text = false; + + const struct anv_pipeline_executable *exe = + anv_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); + + if (exe->nir) { + vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { + WRITE_STR(ir->name, "Final NIR"); + WRITE_STR(ir->description, + "Final NIR before going into the back-end compiler"); + + if (!write_ir_text(ir, exe->nir)) + incomplete_text = true; + } + } + + if (exe->disasm) { + vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { + WRITE_STR(ir->name, "GEN Assembly"); + WRITE_STR(ir->description, + "Final GEN assembly for the generated shader binary"); + + if (!write_ir_text(ir, exe->disasm)) + incomplete_text = true; + } + } + + return incomplete_text ? 
VK_INCOMPLETE : vk_outarray_status(&out); +} + +VkResult +anv_GetRayTracingShaderGroupHandlesKHR( + VkDevice _device, + VkPipeline _pipeline, + uint32_t firstGroup, + uint32_t groupCount, + size_t dataSize, + void* pData) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline); + + if (pipeline->type != ANV_PIPELINE_RAY_TRACING) + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); + + struct anv_ray_tracing_pipeline *rt_pipeline = + anv_pipeline_to_ray_tracing(pipeline); + + for (uint32_t i = 0; i < groupCount; i++) { + struct anv_rt_shader_group *group = &rt_pipeline->groups[firstGroup + i]; + memcpy(pData, group->handle, sizeof(group->handle)); + pData += sizeof(group->handle); + } + + return VK_SUCCESS; +} + +VkResult +anv_GetRayTracingCaptureReplayShaderGroupHandlesKHR( + VkDevice _device, + VkPipeline pipeline, + uint32_t firstGroup, + uint32_t groupCount, + size_t dataSize, + void* pData) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkDeviceSize +anv_GetRayTracingShaderGroupStackSizeKHR( + VkDevice device, + VkPipeline _pipeline, + uint32_t group, + VkShaderGroupShaderKHR groupShader) +{ + ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline); + assert(pipeline->type == ANV_PIPELINE_RAY_TRACING); + + struct anv_ray_tracing_pipeline *rt_pipeline = + anv_pipeline_to_ray_tracing(pipeline); + + assert(group < rt_pipeline->group_count); + + struct anv_shader_bin *bin; + switch (groupShader) { + case VK_SHADER_GROUP_SHADER_GENERAL_KHR: + bin = rt_pipeline->groups[group].general; + break; + + case VK_SHADER_GROUP_SHADER_CLOSEST_HIT_KHR: + bin = rt_pipeline->groups[group].closest_hit; + break; + + case VK_SHADER_GROUP_SHADER_ANY_HIT_KHR: + bin = rt_pipeline->groups[group].any_hit; + break; + + case VK_SHADER_GROUP_SHADER_INTERSECTION_KHR: + bin = rt_pipeline->groups[group].intersection; + break; + + default: + unreachable("Invalid VkShaderGroupShader enum"); + } + + if (bin == NULL) + return 0; + + return brw_bs_prog_data_const(bin->prog_data)->max_stack_size; +} diff --git a/src/intel/vulkan_hasvk/anv_pipeline_cache.c b/src/intel/vulkan_hasvk/anv_pipeline_cache.c new file mode 100644 index 00000000000..e85a362f7f4 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_pipeline_cache.c @@ -0,0 +1,380 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "util/blob.h" +#include "util/hash_table.h" +#include "util/debug.h" +#include "util/disk_cache.h" +#include "util/mesa-sha1.h" +#include "nir/nir_serialize.h" +#include "anv_private.h" +#include "nir/nir_xfb_info.h" +#include "vulkan/util/vk_util.h" + +static bool +anv_shader_bin_serialize(struct vk_pipeline_cache_object *object, + struct blob *blob); + +struct vk_pipeline_cache_object * +anv_shader_bin_deserialize(struct vk_device *device, + const void *key_data, size_t key_size, + struct blob_reader *blob); + +static void +anv_shader_bin_destroy(struct vk_pipeline_cache_object *object) +{ + struct anv_device *device = + container_of(object->device, struct anv_device, vk); + struct anv_shader_bin *shader = + container_of(object, struct anv_shader_bin, base); + + anv_state_pool_free(&device->instruction_state_pool, shader->kernel); + vk_pipeline_cache_object_finish(&shader->base); + vk_free(&device->vk.alloc, shader); +} + +static const struct vk_pipeline_cache_object_ops anv_shader_bin_ops = { + .serialize = anv_shader_bin_serialize, + .deserialize = anv_shader_bin_deserialize, + .destroy = anv_shader_bin_destroy, +}; + +const struct vk_pipeline_cache_object_ops *const anv_cache_import_ops[2] = { + &anv_shader_bin_ops, + NULL +}; + +struct anv_shader_bin * +anv_shader_bin_create(struct anv_device *device, + gl_shader_stage stage, + const void *key_data, uint32_t key_size, + const void *kernel_data, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data_in, + uint32_t prog_data_size, + const struct brw_compile_stats *stats, uint32_t num_stats, + const nir_xfb_info *xfb_info_in, + const struct anv_pipeline_bind_map *bind_map) +{ + VK_MULTIALLOC(ma); + VK_MULTIALLOC_DECL(&ma, struct anv_shader_bin, shader, 1); + VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, key_size); + VK_MULTIALLOC_DECL_SIZE(&ma, struct brw_stage_prog_data, prog_data, + prog_data_size); + VK_MULTIALLOC_DECL(&ma, struct brw_shader_reloc, prog_data_relocs, + prog_data_in->num_relocs); + VK_MULTIALLOC_DECL(&ma, uint32_t, prog_data_param, prog_data_in->nr_params); + + VK_MULTIALLOC_DECL_SIZE(&ma, nir_xfb_info, xfb_info, + xfb_info_in == NULL ? 
0 : + nir_xfb_info_size(xfb_info_in->output_count)); + + VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_binding, surface_to_descriptor, + bind_map->surface_count); + VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_binding, sampler_to_descriptor, + bind_map->sampler_count); + + if (!vk_multialloc_alloc(&ma, &device->vk.alloc, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE)) + return NULL; + + memcpy(obj_key_data, key_data, key_size); + vk_pipeline_cache_object_init(&device->vk, &shader->base, + &anv_shader_bin_ops, obj_key_data, key_size); + + shader->stage = stage; + + shader->kernel = + anv_state_pool_alloc(&device->instruction_state_pool, kernel_size, 64); + memcpy(shader->kernel.map, kernel_data, kernel_size); + shader->kernel_size = kernel_size; + + uint64_t shader_data_addr = INSTRUCTION_STATE_POOL_MIN_ADDRESS + + shader->kernel.offset + + prog_data_in->const_data_offset; + + int rv_count = 0; + struct brw_shader_reloc_value reloc_values[5]; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW, + .value = shader_data_addr, + }; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH, + .value = shader_data_addr >> 32, + }; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_SHADER_START_OFFSET, + .value = shader->kernel.offset, + }; + if (brw_shader_stage_is_bindless(stage)) { + const struct brw_bs_prog_data *bs_prog_data = + brw_bs_prog_data_const(prog_data_in); + uint64_t resume_sbt_addr = INSTRUCTION_STATE_POOL_MIN_ADDRESS + + shader->kernel.offset + + bs_prog_data->resume_sbt_offset; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW, + .value = resume_sbt_addr, + }; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH, + .value = resume_sbt_addr >> 32, + }; + } + + brw_write_shader_relocs(&device->physical->compiler->isa, + shader->kernel.map, prog_data_in, + reloc_values, rv_count); + + memcpy(prog_data, prog_data_in, prog_data_size); + typed_memcpy(prog_data_relocs, prog_data_in->relocs, + prog_data_in->num_relocs); + prog_data->relocs = prog_data_relocs; + memset(prog_data_param, 0, + prog_data->nr_params * sizeof(*prog_data_param)); + prog_data->param = prog_data_param; + shader->prog_data = prog_data; + shader->prog_data_size = prog_data_size; + + assert(num_stats <= ARRAY_SIZE(shader->stats)); + typed_memcpy(shader->stats, stats, num_stats); + shader->num_stats = num_stats; + + if (xfb_info_in) { + *xfb_info = *xfb_info_in; + typed_memcpy(xfb_info->outputs, xfb_info_in->outputs, + xfb_info_in->output_count); + shader->xfb_info = xfb_info; + } else { + shader->xfb_info = NULL; + } + + shader->bind_map = *bind_map; + typed_memcpy(surface_to_descriptor, bind_map->surface_to_descriptor, + bind_map->surface_count); + shader->bind_map.surface_to_descriptor = surface_to_descriptor; + typed_memcpy(sampler_to_descriptor, bind_map->sampler_to_descriptor, + bind_map->sampler_count); + shader->bind_map.sampler_to_descriptor = sampler_to_descriptor; + + return shader; +} + +static bool +anv_shader_bin_serialize(struct vk_pipeline_cache_object *object, + struct blob *blob) +{ + struct anv_shader_bin *shader = + container_of(object, struct anv_shader_bin, base); + + blob_write_uint32(blob, shader->stage); + + blob_write_uint32(blob, shader->kernel_size); + blob_write_bytes(blob, shader->kernel.map, shader->kernel_size); + + blob_write_uint32(blob, 
shader->prog_data_size); + blob_write_bytes(blob, shader->prog_data, shader->prog_data_size); + blob_write_bytes(blob, shader->prog_data->relocs, + shader->prog_data->num_relocs * + sizeof(shader->prog_data->relocs[0])); + + blob_write_uint32(blob, shader->num_stats); + blob_write_bytes(blob, shader->stats, + shader->num_stats * sizeof(shader->stats[0])); + + if (shader->xfb_info) { + uint32_t xfb_info_size = + nir_xfb_info_size(shader->xfb_info->output_count); + blob_write_uint32(blob, xfb_info_size); + blob_write_bytes(blob, shader->xfb_info, xfb_info_size); + } else { + blob_write_uint32(blob, 0); + } + + blob_write_bytes(blob, shader->bind_map.surface_sha1, + sizeof(shader->bind_map.surface_sha1)); + blob_write_bytes(blob, shader->bind_map.sampler_sha1, + sizeof(shader->bind_map.sampler_sha1)); + blob_write_bytes(blob, shader->bind_map.push_sha1, + sizeof(shader->bind_map.push_sha1)); + blob_write_uint32(blob, shader->bind_map.surface_count); + blob_write_uint32(blob, shader->bind_map.sampler_count); + blob_write_bytes(blob, shader->bind_map.surface_to_descriptor, + shader->bind_map.surface_count * + sizeof(*shader->bind_map.surface_to_descriptor)); + blob_write_bytes(blob, shader->bind_map.sampler_to_descriptor, + shader->bind_map.sampler_count * + sizeof(*shader->bind_map.sampler_to_descriptor)); + blob_write_bytes(blob, shader->bind_map.push_ranges, + sizeof(shader->bind_map.push_ranges)); + + return !blob->out_of_memory; +} + +struct vk_pipeline_cache_object * +anv_shader_bin_deserialize(struct vk_device *vk_device, + const void *key_data, size_t key_size, + struct blob_reader *blob) +{ + struct anv_device *device = + container_of(vk_device, struct anv_device, vk); + + gl_shader_stage stage = blob_read_uint32(blob); + + uint32_t kernel_size = blob_read_uint32(blob); + const void *kernel_data = blob_read_bytes(blob, kernel_size); + + uint32_t prog_data_size = blob_read_uint32(blob); + const void *prog_data_bytes = blob_read_bytes(blob, prog_data_size); + if (blob->overrun) + return NULL; + + union brw_any_prog_data prog_data; + memcpy(&prog_data, prog_data_bytes, + MIN2(sizeof(prog_data), prog_data_size)); + prog_data.base.relocs = + blob_read_bytes(blob, prog_data.base.num_relocs * + sizeof(prog_data.base.relocs[0])); + + uint32_t num_stats = blob_read_uint32(blob); + const struct brw_compile_stats *stats = + blob_read_bytes(blob, num_stats * sizeof(stats[0])); + + const nir_xfb_info *xfb_info = NULL; + uint32_t xfb_size = blob_read_uint32(blob); + if (xfb_size) + xfb_info = blob_read_bytes(blob, xfb_size); + + struct anv_pipeline_bind_map bind_map; + blob_copy_bytes(blob, bind_map.surface_sha1, sizeof(bind_map.surface_sha1)); + blob_copy_bytes(blob, bind_map.sampler_sha1, sizeof(bind_map.sampler_sha1)); + blob_copy_bytes(blob, bind_map.push_sha1, sizeof(bind_map.push_sha1)); + bind_map.surface_count = blob_read_uint32(blob); + bind_map.sampler_count = blob_read_uint32(blob); + bind_map.surface_to_descriptor = (void *) + blob_read_bytes(blob, bind_map.surface_count * + sizeof(*bind_map.surface_to_descriptor)); + bind_map.sampler_to_descriptor = (void *) + blob_read_bytes(blob, bind_map.sampler_count * + sizeof(*bind_map.sampler_to_descriptor)); + blob_copy_bytes(blob, bind_map.push_ranges, sizeof(bind_map.push_ranges)); + + if (blob->overrun) + return NULL; + + struct anv_shader_bin *shader = + anv_shader_bin_create(device, stage, + key_data, key_size, + kernel_data, kernel_size, + &prog_data.base, prog_data_size, + stats, num_stats, xfb_info, &bind_map); + if (shader == NULL) + 
return NULL; + + return &shader->base; +} + +struct anv_shader_bin * +anv_device_search_for_kernel(struct anv_device *device, + struct vk_pipeline_cache *cache, + const void *key_data, uint32_t key_size, + bool *user_cache_hit) +{ + /* Use the default pipeline cache if none is specified */ + if (cache == NULL) + cache = device->default_pipeline_cache; + + bool cache_hit = false; + struct vk_pipeline_cache_object *object = + vk_pipeline_cache_lookup_object(cache, key_data, key_size, + &anv_shader_bin_ops, &cache_hit); + if (user_cache_hit != NULL) { + *user_cache_hit = object != NULL && cache_hit && + cache != device->default_pipeline_cache; + } + if (object == NULL) + return NULL; + + return container_of(object, struct anv_shader_bin, base); +} + +struct anv_shader_bin * +anv_device_upload_kernel(struct anv_device *device, + struct vk_pipeline_cache *cache, + gl_shader_stage stage, + const void *key_data, uint32_t key_size, + const void *kernel_data, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, + const struct brw_compile_stats *stats, + uint32_t num_stats, + const nir_xfb_info *xfb_info, + const struct anv_pipeline_bind_map *bind_map) +{ + /* Use the default pipeline cache if none is specified */ + if (cache == NULL) + cache = device->default_pipeline_cache; + + struct anv_shader_bin *shader = + anv_shader_bin_create(device, stage, + key_data, key_size, + kernel_data, kernel_size, + prog_data, prog_data_size, + stats, num_stats, + xfb_info, bind_map); + if (shader == NULL) + return NULL; + + struct vk_pipeline_cache_object *cached = + vk_pipeline_cache_add_object(cache, &shader->base); + + return container_of(cached, struct anv_shader_bin, base); +} + +#define SHA1_KEY_SIZE 20 + +struct nir_shader * +anv_device_search_for_nir(struct anv_device *device, + struct vk_pipeline_cache *cache, + const nir_shader_compiler_options *nir_options, + unsigned char sha1_key[SHA1_KEY_SIZE], + void *mem_ctx) +{ + if (cache == NULL) + cache = device->default_pipeline_cache; + + return vk_pipeline_cache_lookup_nir(cache, sha1_key, SHA1_KEY_SIZE, + nir_options, NULL, mem_ctx); +} + +void +anv_device_upload_nir(struct anv_device *device, + struct vk_pipeline_cache *cache, + const struct nir_shader *nir, + unsigned char sha1_key[SHA1_KEY_SIZE]) +{ + if (cache == NULL) + cache = device->default_pipeline_cache; + + vk_pipeline_cache_add_nir(cache, sha1_key, SHA1_KEY_SIZE, nir); +} diff --git a/src/intel/vulkan_hasvk/anv_private.h b/src/intel/vulkan_hasvk/anv_private.h new file mode 100644 index 00000000000..c00c4565142 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_private.h @@ -0,0 +1,4303 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef ANV_PRIVATE_H +#define ANV_PRIVATE_H + +#include +#include +#include +#include +#include +#include +#include "drm-uapi/i915_drm.h" +#include "drm-uapi/drm_fourcc.h" + +#ifdef HAVE_VALGRIND +#include +#include +#define VG(x) x +#else +#define VG(x) ((void)0) +#endif + +#include "common/intel_clflush.h" +#include "common/intel_decoder.h" +#include "common/intel_gem.h" +#include "common/intel_l3_config.h" +#include "common/intel_measure.h" +#include "common/intel_sample_positions.h" +#include "dev/intel_device_info.h" +#include "blorp/blorp.h" +#include "compiler/brw_compiler.h" +#include "compiler/brw_rt.h" +#include "ds/intel_driver_ds.h" +#include "util/bitset.h" +#include "util/bitscan.h" +#include "util/macros.h" +#include "util/hash_table.h" +#include "util/list.h" +#include "util/perf/u_trace.h" +#include "util/sparse_array.h" +#include "util/u_atomic.h" +#include "util/u_vector.h" +#include "util/u_math.h" +#include "util/vma.h" +#include "util/xmlconfig.h" +#include "vk_alloc.h" +#include "vk_buffer.h" +#include "vk_command_buffer.h" +#include "vk_command_pool.h" +#include "vk_debug_report.h" +#include "vk_device.h" +#include "vk_drm_syncobj.h" +#include "vk_enum_defines.h" +#include "vk_framebuffer.h" +#include "vk_graphics_state.h" +#include "vk_image.h" +#include "vk_instance.h" +#include "vk_pipeline_cache.h" +#include "vk_physical_device.h" +#include "vk_shader_module.h" +#include "vk_sync.h" +#include "vk_sync_timeline.h" +#include "vk_util.h" +#include "vk_queue.h" +#include "vk_log.h" + +/* Pre-declarations needed for WSI entrypoints */ +struct wl_surface; +struct wl_display; +typedef struct xcb_connection_t xcb_connection_t; +typedef uint32_t xcb_visualid_t; +typedef uint32_t xcb_window_t; + +struct anv_batch; +struct anv_buffer; +struct anv_buffer_view; +struct anv_image_view; +struct anv_acceleration_structure; +struct anv_instance; + +struct intel_aux_map_context; +struct intel_perf_config; +struct intel_perf_counter_pass; +struct intel_perf_query_result; + +#include +#include + +#include "anv_android.h" +#include "anv_entrypoints.h" +#include "isl/isl.h" + +#include "dev/intel_debug.h" +#undef MESA_LOG_TAG +#define MESA_LOG_TAG "MESA-INTEL" +#include "util/log.h" +#include "wsi_common.h" + +#define NSEC_PER_SEC 1000000000ull + +/* anv Virtual Memory Layout + * ========================= + * + * When the anv driver is determining the virtual graphics addresses of memory + * objects itself using the softpin mechanism, the following memory ranges + * will be used. + * + * Three special considerations to notice: + * + * (1) the dynamic state pool is located within the same 4 GiB as the low + * heap. This is to work around a VF cache issue described in a comment in + * anv_physical_device_init_heaps. + * + * (2) the binding table pool is located at lower addresses than the surface + * state pool, within a 4 GiB range. This allows surface state base addresses + * to cover both binding tables (16 bit offsets) and surface states (32 bit + * offsets). 
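+ * In the layout below this is the 4 GiB to 6 GiB range covered by the
+ * BINDING_TABLE_POOL_* and SURFACE_STATE_POOL_* defines.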
+ * + * (3) the last 4 GiB of the address space is withheld from the high + * heap. Various hardware units will read past the end of an object for + * various reasons. This healthy margin prevents reads from wrapping around + * 48-bit addresses. + */ +#define GENERAL_STATE_POOL_MIN_ADDRESS 0x000000200000ULL /* 2 MiB */ +#define GENERAL_STATE_POOL_MAX_ADDRESS 0x00003fffffffULL +#define LOW_HEAP_MIN_ADDRESS 0x000040000000ULL /* 1 GiB */ +#define LOW_HEAP_MAX_ADDRESS 0x00007fffffffULL +#define DYNAMIC_STATE_POOL_MIN_ADDRESS 0x0000c0000000ULL /* 3 GiB */ +#define DYNAMIC_STATE_POOL_MAX_ADDRESS 0x0000ffffffffULL +#define BINDING_TABLE_POOL_MIN_ADDRESS 0x000100000000ULL /* 4 GiB */ +#define BINDING_TABLE_POOL_MAX_ADDRESS 0x00013fffffffULL +#define SURFACE_STATE_POOL_MIN_ADDRESS 0x000140000000ULL /* 5 GiB */ +#define SURFACE_STATE_POOL_MAX_ADDRESS 0x00017fffffffULL +#define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000180000000ULL /* 6 GiB */ +#define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL +#define CLIENT_VISIBLE_HEAP_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB */ +#define CLIENT_VISIBLE_HEAP_MAX_ADDRESS 0x0002bfffffffULL +#define HIGH_HEAP_MIN_ADDRESS 0x0002c0000000ULL /* 11 GiB */ + +#define GENERAL_STATE_POOL_SIZE \ + (GENERAL_STATE_POOL_MAX_ADDRESS - GENERAL_STATE_POOL_MIN_ADDRESS + 1) +#define LOW_HEAP_SIZE \ + (LOW_HEAP_MAX_ADDRESS - LOW_HEAP_MIN_ADDRESS + 1) +#define DYNAMIC_STATE_POOL_SIZE \ + (DYNAMIC_STATE_POOL_MAX_ADDRESS - DYNAMIC_STATE_POOL_MIN_ADDRESS + 1) +#define BINDING_TABLE_POOL_SIZE \ + (BINDING_TABLE_POOL_MAX_ADDRESS - BINDING_TABLE_POOL_MIN_ADDRESS + 1) +#define BINDING_TABLE_POOL_BLOCK_SIZE (65536) +#define SURFACE_STATE_POOL_SIZE \ + (SURFACE_STATE_POOL_MAX_ADDRESS - SURFACE_STATE_POOL_MIN_ADDRESS + 1) +#define INSTRUCTION_STATE_POOL_SIZE \ + (INSTRUCTION_STATE_POOL_MAX_ADDRESS - INSTRUCTION_STATE_POOL_MIN_ADDRESS + 1) +#define CLIENT_VISIBLE_HEAP_SIZE \ + (CLIENT_VISIBLE_HEAP_MAX_ADDRESS - CLIENT_VISIBLE_HEAP_MIN_ADDRESS + 1) + +/* Allowing different clear colors requires us to perform a depth resolve at + * the end of certain render passes. This is because while slow clears store + * the clear color in the HiZ buffer, fast clears (without a resolve) don't. + * See the PRMs for examples describing when additional resolves would be + * necessary. To enable fast clears without requiring extra resolves, we set + * the clear value to a globally-defined one. We could allow different values + * if the user doesn't expect coherent data during or after a render passes + * (VK_ATTACHMENT_STORE_OP_DONT_CARE), but such users (aside from the CTS) + * don't seem to exist yet. In almost all Vulkan applications tested thus far, + * 1.0f seems to be the only value used. The only application that doesn't set + * this value does so through the usage of an seemingly uninitialized clear + * value. + */ +#define ANV_HZ_FC_VAL 1.0f + +/* 3DSTATE_VERTEX_BUFFER supports 33 VBs, we use 2 for base & drawid SGVs */ +#define MAX_VBS (33 - 2) + +/* 3DSTATE_VERTEX_ELEMENTS supports up to 34 VEs, but our backend compiler + * only supports the push model of VS inputs, and we only have 128 GRFs, + * minus the g0 and g1 payload, which gives us a maximum of 31 VEs. Plus, + * we use two of them for SGVs. 
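+ * That leaves MAX_VES = 31 - 2 = 29 vertex elements for the application.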
+ */ +#define MAX_VES (31 - 2) + +#define MAX_XFB_BUFFERS 4 +#define MAX_XFB_STREAMS 4 +#define MAX_SETS 32 +#define MAX_RTS 8 +#define MAX_VIEWPORTS 16 +#define MAX_SCISSORS 16 +#define MAX_PUSH_CONSTANTS_SIZE 128 +#define MAX_DYNAMIC_BUFFERS 16 +#define MAX_IMAGES 64 +#define MAX_PUSH_DESCRIPTORS 32 /* Minimum requirement */ +#define MAX_INLINE_UNIFORM_BLOCK_SIZE 4096 +#define MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS 32 +/* We need 16 for UBO block reads to work and 32 for push UBOs. However, we + * use 64 here to avoid cache issues. This could most likely bring it back to + * 32 if we had different virtual addresses for the different views on a given + * GEM object. + */ +#define ANV_UBO_ALIGNMENT 64 +#define ANV_SSBO_ALIGNMENT 4 +#define ANV_SSBO_BOUNDS_CHECK_ALIGNMENT 4 +#define MAX_VIEWS_FOR_PRIMITIVE_REPLICATION 16 +#define MAX_SAMPLE_LOCATIONS 16 + +/* From the Skylake PRM Vol. 7 "Binding Table Surface State Model": + * + * "The surface state model is used when a Binding Table Index (specified + * in the message descriptor) of less than 240 is specified. In this model, + * the Binding Table Index is used to index into the binding table, and the + * binding table entry contains a pointer to the SURFACE_STATE." + * + * Binding table values above 240 are used for various things in the hardware + * such as stateless, stateless with incoherent cache, SLM, and bindless. + */ +#define MAX_BINDING_TABLE_SIZE 240 + +/* The kernel relocation API has a limitation of a 32-bit delta value + * applied to the address before it is written which, in spite of it being + * unsigned, is treated as signed . Because of the way that this maps to + * the Vulkan API, we cannot handle an offset into a buffer that does not + * fit into a signed 32 bits. The only mechanism we have for dealing with + * this at the moment is to limit all VkDeviceMemory objects to a maximum + * of 2GB each. The Vulkan spec allows us to do this: + * + * "Some platforms may have a limit on the maximum size of a single + * allocation. For example, certain systems may fail to create + * allocations with a size greater than or equal to 4GB. Such a limit is + * implementation-dependent, and if such a failure occurs then the error + * VK_ERROR_OUT_OF_DEVICE_MEMORY should be returned." + */ +#define MAX_MEMORY_ALLOCATION_SIZE (1ull << 31) + +#define ANV_SVGS_VB_INDEX MAX_VBS +#define ANV_DRAWID_VB_INDEX (MAX_VBS + 1) + +/* We reserve this MI ALU register for the purpose of handling predication. + * Other code which uses the MI ALU should leave it alone. + */ +#define ANV_PREDICATE_RESULT_REG 0x2678 /* MI_ALU_REG15 */ + +/* We reserve this MI ALU register to pass around an offset computed from + * VkPerformanceQuerySubmitInfoKHR::counterPassIndex VK_KHR_performance_query. + * Other code which uses the MI ALU should leave it alone. + */ +#define ANV_PERF_QUERY_OFFSET_REG 0x2670 /* MI_ALU_REG14 */ + +#define ANV_GRAPHICS_SHADER_STAGE_COUNT (MESA_SHADER_MESH + 1) + +/* For gfx12 we set the streamout buffers using 4 separate commands + * (3DSTATE_SO_BUFFER_INDEX_*) instead of 3DSTATE_SO_BUFFER. However the layout + * of the 3DSTATE_SO_BUFFER_INDEX_* commands is identical to that of + * 3DSTATE_SO_BUFFER apart from the SOBufferIndex field, so for now we use the + * 3DSTATE_SO_BUFFER command, but change the 3DCommandSubOpcode. + * SO_BUFFER_INDEX_0_CMD is actually the 3DCommandSubOpcode for + * 3DSTATE_SO_BUFFER_INDEX_0. 
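+ * (Hence the 0x60 value defined just below.)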
+ */ +#define SO_BUFFER_INDEX_0_CMD 0x60 +#define anv_printflike(a, b) __attribute__((__format__(__printf__, a, b))) + +static inline uint32_t +align_down_npot_u32(uint32_t v, uint32_t a) +{ + return v - (v % a); +} + +static inline uint32_t +align_down_u32(uint32_t v, uint32_t a) +{ + assert(a != 0 && a == (a & -a)); + return v & ~(a - 1); +} + +static inline uint32_t +align_u32(uint32_t v, uint32_t a) +{ + assert(a != 0 && a == (a & -a)); + return align_down_u32(v + a - 1, a); +} + +static inline uint64_t +align_down_u64(uint64_t v, uint64_t a) +{ + assert(a != 0 && a == (a & -a)); + return v & ~(a - 1); +} + +static inline uint64_t +align_u64(uint64_t v, uint64_t a) +{ + return align_down_u64(v + a - 1, a); +} + +static inline int32_t +align_i32(int32_t v, int32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +/** Alignment must be a power of 2. */ +static inline bool +anv_is_aligned(uintmax_t n, uintmax_t a) +{ + assert(a == (a & -a)); + return (n & (a - 1)) == 0; +} + +static inline uint32_t +anv_minify(uint32_t n, uint32_t levels) +{ + if (unlikely(n == 0)) + return 0; + else + return MAX2(n >> levels, 1); +} + +static inline float +anv_clamp_f(float f, float min, float max) +{ + assert(min < max); + + if (f > max) + return max; + else if (f < min) + return min; + else + return f; +} + +static inline bool +anv_clear_mask(uint32_t *inout_mask, uint32_t clear_mask) +{ + if (*inout_mask & clear_mask) { + *inout_mask &= ~clear_mask; + return true; + } else { + return false; + } +} + +static inline union isl_color_value +vk_to_isl_color(VkClearColorValue color) +{ + return (union isl_color_value) { + .u32 = { + color.uint32[0], + color.uint32[1], + color.uint32[2], + color.uint32[3], + }, + }; +} + +static inline union isl_color_value +vk_to_isl_color_with_format(VkClearColorValue color, enum isl_format format) +{ + const struct isl_format_layout *fmtl = isl_format_get_layout(format); + union isl_color_value isl_color = { .u32 = {0, } }; + +#define COPY_COLOR_CHANNEL(c, i) \ + if (fmtl->channels.c.bits) \ + isl_color.u32[i] = color.uint32[i] + + COPY_COLOR_CHANNEL(r, 0); + COPY_COLOR_CHANNEL(g, 1); + COPY_COLOR_CHANNEL(b, 2); + COPY_COLOR_CHANNEL(a, 3); + +#undef COPY_COLOR_CHANNEL + + return isl_color; +} + +static inline void *anv_unpack_ptr(uintptr_t ptr, int bits, int *flags) +{ + uintptr_t mask = (1ull << bits) - 1; + *flags = ptr & mask; + return (void *) (ptr & ~mask); +} + +static inline uintptr_t anv_pack_ptr(void *ptr, int bits, int flags) +{ + uintptr_t value = (uintptr_t) ptr; + uintptr_t mask = (1ull << bits) - 1; + return value | (mask & flags); +} + +/** + * Warn on ignored extension structs. + * + * The Vulkan spec requires us to ignore unsupported or unknown structs in + * a pNext chain. In debug mode, emitting warnings for ignored structs may + * help us discover structs that we should not have ignored. + * + * + * From the Vulkan 1.0.38 spec: + * + * Any component of the implementation (the loader, any enabled layers, + * and drivers) must skip over, without processing (other than reading the + * sType and pNext members) any chained structures with sType values not + * defined by extensions supported by that component. + */ +#define anv_debug_ignored_stype(sType) \ + mesa_logd("%s: ignored VkStructureType %u\n", __func__, (sType)) + +void __anv_perf_warn(struct anv_device *device, + const struct vk_object_base *object, + const char *file, int line, const char *format, ...) 
+ anv_printflike(5, 6); + +/** + * Print a FINISHME message, including its source location. + */ +#define anv_finishme(format, ...) \ + do { \ + static bool reported = false; \ + if (!reported) { \ + mesa_logw("%s:%d: FINISHME: " format, __FILE__, __LINE__, \ + ##__VA_ARGS__); \ + reported = true; \ + } \ + } while (0) + +/** + * Print a perf warning message. Set INTEL_DEBUG=perf to see these. + */ +#define anv_perf_warn(objects_macro, format, ...) \ + do { \ + static bool reported = false; \ + if (!reported && INTEL_DEBUG(DEBUG_PERF)) { \ + __vk_log(VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT, \ + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, \ + objects_macro, __FILE__, __LINE__, \ + format, ## __VA_ARGS__); \ + reported = true; \ + } \ + } while (0) + +/* A non-fatal assert. Useful for debugging. */ +#ifdef DEBUG +#define anv_assert(x) ({ \ + if (unlikely(!(x))) \ + mesa_loge("%s:%d ASSERT: %s", __FILE__, __LINE__, #x); \ +}) +#else +#define anv_assert(x) +#endif + +struct anv_bo { + const char *name; + + uint32_t gem_handle; + + uint32_t refcount; + + /* Index into the current validation list. This is used by the + * validation list building algorithm to track which buffers are already + * in the validation list so that we can ensure uniqueness. + */ + uint32_t exec_obj_index; + + /* Index for use with util_sparse_array_free_list */ + uint32_t free_index; + + /* Last known offset. This value is provided by the kernel when we + * execbuf and is used as the presumed offset for the next bunch of + * relocations. + */ + uint64_t offset; + + /** Size of the buffer not including implicit aux */ + uint64_t size; + + /* Map for internally mapped BOs. + * + * If ANV_BO_ALLOC_MAPPED is set in flags, this is the map for the whole + * BO. If ANV_BO_WRAPPER is set in flags, map points to the wrapped BO. + */ + void *map; + + /** Size of the implicit CCS range at the end of the buffer + * + * On Gfx12, CCS data is always a direct 1/256 scale-down. A single 64K + * page of main surface data maps to a 256B chunk of CCS data and that + * mapping is provided on TGL-LP by the AUX table which maps virtual memory + * addresses in the main surface to virtual memory addresses for CCS data. + * + * Because we can't change these maps around easily and because Vulkan + * allows two VkImages to be bound to overlapping memory regions (as long + * as the app is careful), it's not feasible to make this mapping part of + * the image. (On Gfx11 and earlier, the mapping was provided via + * RENDER_SURFACE_STATE so each image had its own main -> CCS mapping.) + * Instead, we attach the CCS data directly to the buffer object and setup + * the AUX table mapping at BO creation time. + * + * This field is for internal tracking use by the BO allocator only and + * should not be touched by other parts of the code. If something wants to + * know if a BO has implicit CCS data, it should instead look at the + * has_implicit_ccs boolean below. + * + * This data is not included in maps of this buffer. + */ + uint32_t _ccs_size; + + /** Flags to pass to the kernel through drm_i915_exec_object2::flags */ + uint32_t flags; + + /** True if this BO may be shared with other processes */ + bool is_external:1; + + /** True if this BO is a wrapper + * + * When set to true, none of the fields in this BO are meaningful except + * for anv_bo::is_wrapper and anv_bo::map which points to the actual BO. + * See also anv_bo_unwrap(). Wrapper BOs are not allowed when use_softpin + * is set in the physical device. 
+ */ + bool is_wrapper:1; + + /** See also ANV_BO_ALLOC_FIXED_ADDRESS */ + bool has_fixed_address:1; + + /** True if this BO wraps a host pointer */ + bool from_host_ptr:1; + + /** See also ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS */ + bool has_client_visible_address:1; + + /** True if this BO has implicit CCS data attached to it */ + bool has_implicit_ccs:1; +}; + +static inline struct anv_bo * +anv_bo_ref(struct anv_bo *bo) +{ + p_atomic_inc(&bo->refcount); + return bo; +} + +static inline struct anv_bo * +anv_bo_unwrap(struct anv_bo *bo) +{ + while (bo->is_wrapper) + bo = bo->map; + return bo; +} + +static inline bool +anv_bo_is_pinned(struct anv_bo *bo) +{ +#if defined(GFX_VERx10) && GFX_VERx10 >= 90 + /* Sky Lake and later always uses softpin */ + assert(bo->flags & EXEC_OBJECT_PINNED); + return true; +#elif defined(GFX_VERx10) && GFX_VERx10 < 80 + /* Haswell and earlier never use softpin */ + assert(!(bo->flags & EXEC_OBJECT_PINNED)); + assert(!bo->has_fixed_address); + return false; +#else + /* If we don't have a GFX_VERx10 #define, we need to look at the BO. Also, + * for GFX version 8, we need to look at the BO because Broadwell softpins + * but Cherryview doesn't. + */ + assert((bo->flags & EXEC_OBJECT_PINNED) || !bo->has_fixed_address); + return (bo->flags & EXEC_OBJECT_PINNED) != 0; +#endif +} + +struct anv_address { + struct anv_bo *bo; + int64_t offset; +}; + +#define ANV_NULL_ADDRESS ((struct anv_address) { NULL, 0 }) + +static inline struct anv_address +anv_address_from_u64(uint64_t addr_u64) +{ + assert(addr_u64 == intel_canonical_address(addr_u64)); + return (struct anv_address) { + .bo = NULL, + .offset = addr_u64, + }; +} + +static inline bool +anv_address_is_null(struct anv_address addr) +{ + return addr.bo == NULL && addr.offset == 0; +} + +static inline uint64_t +anv_address_physical(struct anv_address addr) +{ + if (addr.bo && anv_bo_is_pinned(addr.bo)) { + return intel_canonical_address(addr.bo->offset + addr.offset); + } else { + return intel_canonical_address(addr.offset); + } +} + +static inline struct anv_address +anv_address_add(struct anv_address addr, uint64_t offset) +{ + addr.offset += offset; + return addr; +} + +/* Represents a lock-free linked list of "free" things. This is used by + * both the block pool and the state pools. Unfortunately, in order to + * solve the ABA problem, we can't use a single uint32_t head. + */ +union anv_free_list { + struct { + uint32_t offset; + + /* A simple count that is incremented every time the head changes. */ + uint32_t count; + }; + /* Make sure it's aligned to 64 bits. This will make atomic operations + * faster on 32 bit platforms. + */ + uint64_t u64 __attribute__ ((aligned (8))); +}; + +#define ANV_FREE_LIST_EMPTY ((union anv_free_list) { { UINT32_MAX, 0 } }) + +struct anv_block_state { + union { + struct { + uint32_t next; + uint32_t end; + }; + /* Make sure it's aligned to 64 bits. This will make atomic operations + * faster on 32 bit platforms. + */ + uint64_t u64 __attribute__ ((aligned (8))); + }; +}; + +#define anv_block_pool_foreach_bo(bo, pool) \ + for (struct anv_bo **_pp_bo = (pool)->bos, *bo; \ + _pp_bo != &(pool)->bos[(pool)->nbos] && (bo = *_pp_bo, true); \ + _pp_bo++) + +#define ANV_MAX_BLOCK_POOL_BOS 20 + +struct anv_block_pool { + const char *name; + + struct anv_device *device; + bool use_relocations; + + /* Wrapper BO for use in relocation lists. This BO is simply a wrapper + * around the actual BO so that we grow the pool after the wrapper BO has + * been put in a relocation list. 
This is only used in the non-softpin + * case. + */ + struct anv_bo wrapper_bo; + + struct anv_bo *bos[ANV_MAX_BLOCK_POOL_BOS]; + struct anv_bo *bo; + uint32_t nbos; + + uint64_t size; + + /* The address where the start of the pool is pinned. The various bos that + * are created as the pool grows will have addresses in the range + * [start_address, start_address + BLOCK_POOL_MEMFD_SIZE). + */ + uint64_t start_address; + + /* The offset from the start of the bo to the "center" of the block + * pool. Pointers to allocated blocks are given by + * bo.map + center_bo_offset + offsets. + */ + uint32_t center_bo_offset; + + /* Current memory map of the block pool. This pointer may or may not + * point to the actual beginning of the block pool memory. If + * anv_block_pool_alloc_back has ever been called, then this pointer + * will point to the "center" position of the buffer and all offsets + * (negative or positive) given out by the block pool alloc functions + * will be valid relative to this pointer. + * + * In particular, map == bo.map + center_offset + * + * DO NOT access this pointer directly. Use anv_block_pool_map() instead, + * since it will handle the softpin case as well, where this points to NULL. + */ + void *map; + int fd; + + /** + * Array of mmaps and gem handles owned by the block pool, reclaimed when + * the block pool is destroyed. + */ + struct u_vector mmap_cleanups; + + struct anv_block_state state; + + struct anv_block_state back_state; +}; + +/* Block pools are backed by a fixed-size 1GB memfd */ +#define BLOCK_POOL_MEMFD_SIZE (1ul << 30) + +/* The center of the block pool is also the middle of the memfd. This may + * change in the future if we decide differently for some reason. + */ +#define BLOCK_POOL_MEMFD_CENTER (BLOCK_POOL_MEMFD_SIZE / 2) + +static inline uint32_t +anv_block_pool_size(struct anv_block_pool *pool) +{ + return pool->state.end + pool->back_state.end; +} + +struct anv_state { + int32_t offset; + uint32_t alloc_size; + void *map; + uint32_t idx; +}; + +#define ANV_STATE_NULL ((struct anv_state) { .alloc_size = 0 }) + +struct anv_fixed_size_state_pool { + union anv_free_list free_list; + struct anv_block_state block; +}; + +#define ANV_MIN_STATE_SIZE_LOG2 6 +#define ANV_MAX_STATE_SIZE_LOG2 21 + +#define ANV_STATE_BUCKETS (ANV_MAX_STATE_SIZE_LOG2 - ANV_MIN_STATE_SIZE_LOG2 + 1) + +struct anv_free_entry { + uint32_t next; + struct anv_state state; +}; + +struct anv_state_table { + struct anv_device *device; + int fd; + struct anv_free_entry *map; + uint32_t size; + struct anv_block_state state; + struct u_vector cleanups; +}; + +struct anv_state_pool { + struct anv_block_pool block_pool; + + /* Offset into the relevant state base address where the state pool starts + * allocating memory. 
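+    *
+    * For example (rough sketch, hypothetical caller, not part of this
+    * patch):
+    *
+    *    struct anv_state s = anv_state_pool_alloc(pool, 64, 64);
+    *    memcpy(s.map, data, 64);
+    *    anv_state_pool_free(pool, s);
+    *
+    * where s.map is the CPU pointer and the GPU address is presumably the
+    * pool's base_address plus s.offset, with offsets handed out starting at
+    * start_offset.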
+ */ + int32_t start_offset; + + struct anv_state_table table; + + /* The size of blocks which will be allocated from the block pool */ + uint32_t block_size; + + /** Free list for "back" allocations */ + union anv_free_list back_alloc_free_list; + + struct anv_fixed_size_state_pool buckets[ANV_STATE_BUCKETS]; +}; + +struct anv_state_reserved_pool { + struct anv_state_pool *pool; + union anv_free_list reserved_blocks; + uint32_t count; +}; + +struct anv_state_stream { + struct anv_state_pool *state_pool; + + /* The size of blocks to allocate from the state pool */ + uint32_t block_size; + + /* Current block we're allocating from */ + struct anv_state block; + + /* Offset into the current block at which to allocate the next state */ + uint32_t next; + + /* List of all blocks allocated from this pool */ + struct util_dynarray all_blocks; +}; + +/* The block_pool functions exported for testing only. The block pool should + * only be used via a state pool (see below). + */ +VkResult anv_block_pool_init(struct anv_block_pool *pool, + struct anv_device *device, + const char *name, + uint64_t start_address, + uint32_t initial_size); +void anv_block_pool_finish(struct anv_block_pool *pool); +int32_t anv_block_pool_alloc(struct anv_block_pool *pool, + uint32_t block_size, uint32_t *padding); +int32_t anv_block_pool_alloc_back(struct anv_block_pool *pool, + uint32_t block_size); +void* anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t +size); + +VkResult anv_state_pool_init(struct anv_state_pool *pool, + struct anv_device *device, + const char *name, + uint64_t base_address, + int32_t start_offset, + uint32_t block_size); +void anv_state_pool_finish(struct anv_state_pool *pool); +struct anv_state anv_state_pool_alloc(struct anv_state_pool *pool, + uint32_t state_size, uint32_t alignment); +struct anv_state anv_state_pool_alloc_back(struct anv_state_pool *pool); +void anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state); +void anv_state_stream_init(struct anv_state_stream *stream, + struct anv_state_pool *state_pool, + uint32_t block_size); +void anv_state_stream_finish(struct anv_state_stream *stream); +struct anv_state anv_state_stream_alloc(struct anv_state_stream *stream, + uint32_t size, uint32_t alignment); + +void anv_state_reserved_pool_init(struct anv_state_reserved_pool *pool, + struct anv_state_pool *parent, + uint32_t count, uint32_t size, + uint32_t alignment); +void anv_state_reserved_pool_finish(struct anv_state_reserved_pool *pool); +struct anv_state anv_state_reserved_pool_alloc(struct anv_state_reserved_pool *pool); +void anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool, + struct anv_state state); + +VkResult anv_state_table_init(struct anv_state_table *table, + struct anv_device *device, + uint32_t initial_entries); +void anv_state_table_finish(struct anv_state_table *table); +VkResult anv_state_table_add(struct anv_state_table *table, uint32_t *idx, + uint32_t count); +void anv_free_list_push(union anv_free_list *list, + struct anv_state_table *table, + uint32_t idx, uint32_t count); +struct anv_state* anv_free_list_pop(union anv_free_list *list, + struct anv_state_table *table); + + +static inline struct anv_state * +anv_state_table_get(struct anv_state_table *table, uint32_t idx) +{ + return &table->map[idx].state; +} +/** + * Implements a pool of re-usable BOs. The interface is identical to that + * of block_pool except that each block is its own BO. 
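+ *
+ * A rough usage sketch (hypothetical caller, not part of this patch; the
+ * real pools such as anv_device::batch_bo_pool are set up by the device
+ * code):
+ *
+ *    struct anv_bo *bo;
+ *    VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool,
+ *                                        16 * 1024, &bo);
+ *    if (result == VK_SUCCESS) {
+ *       ... fill bo->map, submit ...
+ *       anv_bo_pool_free(&device->batch_bo_pool, bo);
+ *    }
+ *
+ * Freed BOs presumably land on one of the free_list buckets below rather
+ * than going back to the kernel, so a later allocation of a similar size
+ * can reuse them.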
+ */ +struct anv_bo_pool { + const char *name; + + struct anv_device *device; + + struct util_sparse_array_free_list free_list[16]; +}; + +void anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device, + const char *name); +void anv_bo_pool_finish(struct anv_bo_pool *pool); +VkResult anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size, + struct anv_bo **bo_out); +void anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo); + +struct anv_scratch_pool { + /* Indexed by Per-Thread Scratch Space number (the hardware value) and stage */ + struct anv_bo *bos[16][MESA_SHADER_STAGES]; + uint32_t surfs[16]; + struct anv_state surf_states[16]; +}; + +void anv_scratch_pool_init(struct anv_device *device, + struct anv_scratch_pool *pool); +void anv_scratch_pool_finish(struct anv_device *device, + struct anv_scratch_pool *pool); +struct anv_bo *anv_scratch_pool_alloc(struct anv_device *device, + struct anv_scratch_pool *pool, + gl_shader_stage stage, + unsigned per_thread_scratch); +uint32_t anv_scratch_pool_get_surf(struct anv_device *device, + struct anv_scratch_pool *pool, + unsigned per_thread_scratch); + +/** Implements a BO cache that ensures a 1-1 mapping of GEM BOs to anv_bos */ +struct anv_bo_cache { + struct util_sparse_array bo_map; + pthread_mutex_t mutex; +}; + +VkResult anv_bo_cache_init(struct anv_bo_cache *cache, + struct anv_device *device); +void anv_bo_cache_finish(struct anv_bo_cache *cache); + +struct anv_queue_family { + /* Standard bits passed on to the client */ + VkQueueFlags queueFlags; + uint32_t queueCount; + + /* Driver internal information */ + enum drm_i915_gem_engine_class engine_class; +}; + +#define ANV_MAX_QUEUE_FAMILIES 3 + +struct anv_memory_type { + /* Standard bits passed on to the client */ + VkMemoryPropertyFlags propertyFlags; + uint32_t heapIndex; +}; + +struct anv_memory_heap { + /* Standard bits passed on to the client */ + VkDeviceSize size; + VkMemoryHeapFlags flags; + + /** Driver-internal book-keeping. + * + * Align it to 64 bits to make atomic operations faster on 32 bit platforms. + */ + VkDeviceSize used __attribute__ ((aligned (8))); + + bool is_local_mem; +}; + +struct anv_memregion { + struct drm_i915_gem_memory_class_instance region; + uint64_t size; + uint64_t available; +}; + +struct anv_physical_device { + struct vk_physical_device vk; + + /* Link in anv_instance::physical_devices */ + struct list_head link; + + struct anv_instance * instance; + char path[20]; + struct intel_device_info info; + /** Amount of "GPU memory" we want to advertise + * + * Clearly, this value is bogus since Intel is a UMA architecture. On + * gfx7 platforms, we are limited by GTT size unless we want to implement + * fine-grained tracking and GTT splitting. On Broadwell and above we are + * practically unlimited. However, we will never report more than 3/4 of + * the total system ram to try and avoid running out of RAM. + */ + bool supports_48bit_addresses; + struct brw_compiler * compiler; + struct isl_device isl_dev; + struct intel_perf_config * perf; + /* True if hardware support is incomplete/alpha */ + bool is_alpha; + /* + * Number of commands required to implement a performance query begin + + * end. 
+ */ + uint32_t n_perf_query_commands; + int cmd_parser_version; + bool has_exec_async; + bool has_exec_capture; + int max_context_priority; + bool has_context_isolation; + bool has_mmap_offset; + bool has_userptr_probe; + uint64_t gtt_size; + + bool use_relocations; + bool use_softpin; + bool always_use_bindless; + bool use_call_secondary; + + /** True if we can access buffers using A64 messages */ + bool has_a64_buffer_access; + /** True if we can use bindless access for images */ + bool has_bindless_images; + /** True if we can use bindless access for samplers */ + bool has_bindless_samplers; + /** True if we can use timeline semaphores through execbuf */ + bool has_exec_timeline; + + /** True if we can read the GPU timestamp register + * + * When running in a virtual context, the timestamp register is unreadable + * on Gfx12+. + */ + bool has_reg_timestamp; + + /** True if this device has implicit AUX + * + * If true, CCS is handled as an implicit attachment to the BO rather than + * as an explicitly bound surface. + */ + bool has_implicit_ccs; + + bool always_flush_cache; + + struct { + uint32_t family_count; + struct anv_queue_family families[ANV_MAX_QUEUE_FAMILIES]; + } queue; + + struct { + uint32_t type_count; + struct anv_memory_type types[VK_MAX_MEMORY_TYPES]; + uint32_t heap_count; + struct anv_memory_heap heaps[VK_MAX_MEMORY_HEAPS]; + bool need_clflush; + } memory; + + /* Either we have a single vram region and it's all mappable, or we have + * both mappable & non-mappable parts. System memory is always available. + */ + struct anv_memregion vram_mappable; + struct anv_memregion vram_non_mappable; + struct anv_memregion sys; + uint8_t driver_build_sha1[20]; + uint8_t pipeline_cache_uuid[VK_UUID_SIZE]; + uint8_t driver_uuid[VK_UUID_SIZE]; + uint8_t device_uuid[VK_UUID_SIZE]; + + struct vk_sync_type sync_syncobj_type; + struct vk_sync_timeline_type sync_timeline_type; + const struct vk_sync_type * sync_types[4]; + + struct wsi_device wsi_device; + int local_fd; + bool has_local; + int64_t local_major; + int64_t local_minor; + int master_fd; + bool has_master; + int64_t master_major; + int64_t master_minor; + struct drm_i915_query_engine_info * engine_info; + + void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address, bool); + struct intel_measure_device measure_device; +}; + +static inline bool +anv_physical_device_has_vram(const struct anv_physical_device *device) +{ + return device->vram_mappable.size > 0; +} + +struct anv_instance { + struct vk_instance vk; + + struct driOptionCache dri_options; + struct driOptionCache available_dri_options; + + /** + * Workarounds for game bugs. 
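+    *
+    * These presumably come from driconf and are filled in at instance
+    * creation from the dri_options cache above, along the lines of (the
+    * option name here is illustrative):
+    *
+    *    instance->assume_full_subgroups =
+    *       driQueryOptionb(&instance->dri_options,
+    *                       "anv_assume_full_subgroups");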
+ */ + bool assume_full_subgroups; + bool limit_trig_input_range; + bool sample_mask_out_opengl_behaviour; +}; + +VkResult anv_init_wsi(struct anv_physical_device *physical_device); +void anv_finish_wsi(struct anv_physical_device *physical_device); + +struct anv_queue { + struct vk_queue vk; + + struct anv_device * device; + + const struct anv_queue_family * family; + + uint32_t index_in_family; + + uint32_t exec_flags; + + /** Synchronization object for debug purposes (DEBUG_SYNC) */ + struct vk_sync *sync; + + struct intel_ds_queue * ds; +}; + +struct nir_xfb_info; +struct anv_pipeline_bind_map; + +extern const struct vk_pipeline_cache_object_ops *const anv_cache_import_ops[2]; + +struct anv_shader_bin * +anv_device_search_for_kernel(struct anv_device *device, + struct vk_pipeline_cache *cache, + const void *key_data, uint32_t key_size, + bool *user_cache_bit); + +struct anv_shader_bin * +anv_device_upload_kernel(struct anv_device *device, + struct vk_pipeline_cache *cache, + gl_shader_stage stage, + const void *key_data, uint32_t key_size, + const void *kernel_data, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, + const struct brw_compile_stats *stats, + uint32_t num_stats, + const struct nir_xfb_info *xfb_info, + const struct anv_pipeline_bind_map *bind_map); + +struct nir_shader; +struct nir_shader_compiler_options; + +struct nir_shader * +anv_device_search_for_nir(struct anv_device *device, + struct vk_pipeline_cache *cache, + const struct nir_shader_compiler_options *nir_options, + unsigned char sha1_key[20], + void *mem_ctx); + +void +anv_device_upload_nir(struct anv_device *device, + struct vk_pipeline_cache *cache, + const struct nir_shader *nir, + unsigned char sha1_key[20]); + +struct anv_device { + struct vk_device vk; + + struct anv_physical_device * physical; + const struct intel_device_info * info; + struct isl_device isl_dev; + int context_id; + int fd; + bool can_chain_batches; + bool robust_buffer_access; + + pthread_mutex_t vma_mutex; + struct util_vma_heap vma_lo; + struct util_vma_heap vma_cva; + struct util_vma_heap vma_hi; + + /** List of all anv_device_memory objects */ + struct list_head memory_objects; + + struct anv_bo_pool batch_bo_pool; + struct anv_bo_pool utrace_bo_pool; + + struct anv_bo_cache bo_cache; + + struct anv_state_pool general_state_pool; + struct anv_state_pool dynamic_state_pool; + struct anv_state_pool instruction_state_pool; + struct anv_state_pool binding_table_pool; + struct anv_state_pool surface_state_pool; + + struct anv_state_reserved_pool custom_border_colors; + + /** BO used for various workarounds + * + * There are a number of workarounds on our hardware which require writing + * data somewhere and it doesn't really matter where. For that, we use + * this BO and just write to the first dword or so. + * + * We also need to be able to handle NULL buffers bound as pushed UBOs. + * For that, we use the high bytes (>= 1024) of the workaround BO. + */ + struct anv_bo * workaround_bo; + struct anv_address workaround_address; + + struct anv_bo * trivial_batch_bo; + struct anv_state null_surface_state; + + struct vk_pipeline_cache * default_pipeline_cache; + struct vk_pipeline_cache * internal_cache; + struct blorp_context blorp; + + struct anv_state border_colors; + + struct anv_state slice_hash; + + /** An array of CPS_STATE structures grouped by MAX_VIEWPORTS elements + * + * We need to emit CPS_STATE structures for each viewport accessible by a + * pipeline. 
So rather than write many identical CPS_STATE structures + * dynamically, we can enumerate all possible combinaisons and then just + * emit a 3DSTATE_CPS_POINTERS instruction with the right offset into this + * array. + */ + struct anv_state cps_states; + + uint32_t queue_count; + struct anv_queue * queues; + + struct anv_scratch_pool scratch_pool; + struct anv_bo *rt_scratch_bos[16]; + + /** Shadow ray query BO + * + * The ray_query_bo only holds the current ray being traced. When using + * more than 1 ray query per thread, we cannot fit all the queries in + * there, so we need a another buffer to hold query data that is not + * currently being used by the HW for tracing, similar to a scratch space. + * + * The size of the shadow buffer depends on the number of queries per + * shader. + */ + struct anv_bo *ray_query_shadow_bos[16]; + /** Ray query buffer used to communicated with HW unit. + */ + struct anv_bo *ray_query_bo; + + struct anv_shader_bin *rt_trampoline; + struct anv_shader_bin *rt_trivial_return; + + pthread_mutex_t mutex; + pthread_cond_t queue_submit; + + struct intel_batch_decode_ctx decoder_ctx; + /* + * When decoding a anv_cmd_buffer, we might need to search for BOs through + * the cmd_buffer's list. + */ + struct anv_cmd_buffer *cmd_buffer_being_decoded; + + int perf_fd; /* -1 if no opened */ + uint64_t perf_metric; /* 0 if unset */ + + struct intel_aux_map_context *aux_map_ctx; + + const struct intel_l3_config *l3_config; + + struct intel_debug_block_frame *debug_frame_desc; + + struct intel_ds_device ds; +}; + +#if defined(GFX_VERx10) && GFX_VERx10 >= 90 +#define ANV_ALWAYS_SOFTPIN true +#else +#define ANV_ALWAYS_SOFTPIN false +#endif + +static inline bool +anv_use_relocations(const struct anv_physical_device *pdevice) +{ +#if defined(GFX_VERx10) && GFX_VERx10 >= 90 + /* Sky Lake and later always uses softpin */ + assert(!pdevice->use_relocations); + return false; +#elif defined(GFX_VERx10) && GFX_VERx10 < 80 + /* Haswell and earlier never use softpin */ + assert(pdevice->use_relocations); + return true; +#else + /* If we don't have a GFX_VERx10 #define, we need to look at the physical + * device. Also, for GFX version 8, we need to look at the physical + * device because Broadwell softpins but Cherryview doesn't. + */ + return pdevice->use_relocations; +#endif +} + +static inline struct anv_state_pool * +anv_binding_table_pool(struct anv_device *device) +{ + if (anv_use_relocations(device->physical)) + return &device->surface_state_pool; + else + return &device->binding_table_pool; +} + +static inline struct anv_state +anv_binding_table_pool_alloc(struct anv_device *device) +{ + if (anv_use_relocations(device->physical)) + return anv_state_pool_alloc_back(&device->surface_state_pool); + else + return anv_state_pool_alloc(&device->binding_table_pool, + device->binding_table_pool.block_size, 0); +} + +static inline void +anv_binding_table_pool_free(struct anv_device *device, struct anv_state state) { + anv_state_pool_free(anv_binding_table_pool(device), state); +} + +static inline uint32_t +anv_mocs(const struct anv_device *device, + const struct anv_bo *bo, + isl_surf_usage_flags_t usage) +{ + return isl_mocs(&device->isl_dev, usage, bo && bo->is_external); +} + +void anv_device_init_blorp(struct anv_device *device); +void anv_device_finish_blorp(struct anv_device *device); + +enum anv_bo_alloc_flags { + /** Specifies that the BO must have a 32-bit address + * + * This is the opposite of EXEC_OBJECT_SUPPORTS_48B_ADDRESS. 
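+    *
+    * When softpin is in use, such BOs presumably have to take their virtual
+    * address from the low 4GiB heap rather than the high one, roughly
+    * (sketch only, see anv_vma_alloc() for the actual policy):
+    *
+    *    if (alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS)
+    *       addr = util_vma_heap_alloc(&device->vma_lo, size, align);
+    *    else
+    *       addr = util_vma_heap_alloc(&device->vma_hi, size, align);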
+ */ + ANV_BO_ALLOC_32BIT_ADDRESS = (1 << 0), + + /** Specifies that the BO may be shared externally */ + ANV_BO_ALLOC_EXTERNAL = (1 << 1), + + /** Specifies that the BO should be mapped */ + ANV_BO_ALLOC_MAPPED = (1 << 2), + + /** Specifies that the BO should be snooped so we get coherency */ + ANV_BO_ALLOC_SNOOPED = (1 << 3), + + /** Specifies that the BO should be captured in error states */ + ANV_BO_ALLOC_CAPTURE = (1 << 4), + + /** Specifies that the BO will have an address assigned by the caller + * + * Such BOs do not exist in any VMA heap. + */ + ANV_BO_ALLOC_FIXED_ADDRESS = (1 << 5), + + /** Enables implicit synchronization on the BO + * + * This is the opposite of EXEC_OBJECT_ASYNC. + */ + ANV_BO_ALLOC_IMPLICIT_SYNC = (1 << 6), + + /** Enables implicit synchronization on the BO + * + * This is equivalent to EXEC_OBJECT_WRITE. + */ + ANV_BO_ALLOC_IMPLICIT_WRITE = (1 << 7), + + /** Has an address which is visible to the client */ + ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS = (1 << 8), + + /** This buffer has implicit CCS data attached to it */ + ANV_BO_ALLOC_IMPLICIT_CCS = (1 << 9), + + /** This buffer is allocated from local memory and should be cpu visible */ + ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE = (1 << 10), +}; + +VkResult anv_device_alloc_bo(struct anv_device *device, + const char *name, uint64_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t explicit_address, + struct anv_bo **bo); +VkResult anv_device_map_bo(struct anv_device *device, + struct anv_bo *bo, + uint64_t offset, + size_t size, + uint32_t gem_flags, + void **map_out); +void anv_device_unmap_bo(struct anv_device *device, + struct anv_bo *bo, + void *map, size_t map_size); +VkResult anv_device_import_bo_from_host_ptr(struct anv_device *device, + void *host_ptr, uint32_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address, + struct anv_bo **bo_out); +VkResult anv_device_import_bo(struct anv_device *device, int fd, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address, + struct anv_bo **bo); +VkResult anv_device_export_bo(struct anv_device *device, + struct anv_bo *bo, int *fd_out); +VkResult anv_device_get_bo_tiling(struct anv_device *device, + struct anv_bo *bo, + enum isl_tiling *tiling_out); +VkResult anv_device_set_bo_tiling(struct anv_device *device, + struct anv_bo *bo, + uint32_t row_pitch_B, + enum isl_tiling tiling); +void anv_device_release_bo(struct anv_device *device, + struct anv_bo *bo); + +static inline void anv_device_set_physical(struct anv_device *device, + struct anv_physical_device *physical_device) +{ + device->physical = physical_device; + device->info = &physical_device->info; + device->isl_dev = physical_device->isl_dev; +} + +static inline struct anv_bo * +anv_device_lookup_bo(struct anv_device *device, uint32_t gem_handle) +{ + return util_sparse_array_get(&device->bo_cache.bo_map, gem_handle); +} + +VkResult anv_device_wait(struct anv_device *device, struct anv_bo *bo, + int64_t timeout); + +VkResult anv_queue_init(struct anv_device *device, struct anv_queue *queue, + uint32_t exec_flags, + const VkDeviceQueueCreateInfo *pCreateInfo, + uint32_t index_in_family); +void anv_queue_finish(struct anv_queue *queue); + +VkResult anv_queue_submit(struct vk_queue *queue, + struct vk_queue_submit *submit); +VkResult anv_queue_submit_simple_batch(struct anv_queue *queue, + struct anv_batch *batch); + +void* anv_gem_mmap(struct anv_device *device, + uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags); +void anv_gem_munmap(struct anv_device *device, void 
*p, uint64_t size); +uint32_t anv_gem_create(struct anv_device *device, uint64_t size); +void anv_gem_close(struct anv_device *device, uint32_t gem_handle); +uint32_t anv_gem_create_regions(struct anv_device *device, uint64_t anv_bo_size, + uint32_t flags, uint32_t num_regions, + struct drm_i915_gem_memory_class_instance *regions); +uint32_t anv_gem_userptr(struct anv_device *device, void *mem, size_t size); +int anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns); +int anv_gem_execbuffer(struct anv_device *device, + struct drm_i915_gem_execbuffer2 *execbuf); +int anv_gem_set_tiling(struct anv_device *device, uint32_t gem_handle, + uint32_t stride, uint32_t tiling); +int anv_gem_create_context(struct anv_device *device); +bool anv_gem_has_context_priority(int fd, int priority); +int anv_gem_destroy_context(struct anv_device *device, int context); +int anv_gem_set_context_param(int fd, int context, uint32_t param, + uint64_t value); +int anv_gem_get_param(int fd, uint32_t param); +int anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle); +int anv_gem_context_get_reset_stats(int fd, int context, + uint32_t *active, uint32_t *pending); +int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle); +int anv_gem_reg_read(int fd, uint32_t offset, uint64_t *result); +uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd); +int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching); +int anv_i915_query(int fd, uint64_t query_id, void *buffer, + int32_t *buffer_len); +struct drm_i915_query_engine_info *anv_gem_get_engine_info(int fd); + +uint64_t anv_vma_alloc(struct anv_device *device, + uint64_t size, uint64_t align, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address); +void anv_vma_free(struct anv_device *device, + uint64_t address, uint64_t size); + +struct anv_reloc_list { + uint32_t num_relocs; + uint32_t array_length; + struct drm_i915_gem_relocation_entry * relocs; + struct anv_bo ** reloc_bos; + uint32_t dep_words; + BITSET_WORD * deps; +}; + +VkResult anv_reloc_list_init(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc); +void anv_reloc_list_finish(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc); + +VkResult anv_reloc_list_add(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + uint32_t offset, struct anv_bo *target_bo, + uint32_t delta, uint64_t *address_u64_out); + +VkResult anv_reloc_list_add_bo(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + struct anv_bo *target_bo); + +struct anv_batch_bo { + /* Link in the anv_cmd_buffer.owned_batch_bos list */ + struct list_head link; + + struct anv_bo * bo; + + /* Bytes actually consumed in this batch BO */ + uint32_t length; + + /* When this batch BO is used as part of a primary batch buffer, this + * tracked whether it is chained to another primary batch buffer. + * + * If this is the case, the relocation list's last entry points the + * location of the MI_BATCH_BUFFER_START chaining to the next batch. + */ + bool chained; + + struct anv_reloc_list relocs; +}; + +struct anv_batch { + const VkAllocationCallbacks * alloc; + + struct anv_address start_addr; + + void * start; + void * end; + void * next; + + struct anv_reloc_list * relocs; + + /* This callback is called (with the associated user data) in the event + * that the batch runs out of space. 
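+    *
+    * A simplified sketch of what anv_batch_emit_dwords() presumably does
+    * with it (illustrative, not the actual implementation):
+    *
+    *    if (batch->next + 4 * num_dwords > batch->end) {
+    *       VkResult result = batch->extend_cb(batch, batch->user_data);
+    *       if (result != VK_SUCCESS) {
+    *          anv_batch_set_error(batch, result);
+    *          return NULL;
+    *       }
+    *    }
+    *    void *p = batch->next;
+    *    batch->next += 4 * num_dwords;
+    *    return p;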
+ */ + VkResult (*extend_cb)(struct anv_batch *, void *); + void * user_data; + + /** + * Current error status of the command buffer. Used to track inconsistent + * or incomplete command buffer states that are the consequence of run-time + * errors such as out of memory scenarios. We want to track this in the + * batch because the command buffer object is not visible to some parts + * of the driver. + */ + VkResult status; +}; + +void *anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords); +void anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other); +struct anv_address anv_batch_address(struct anv_batch *batch, void *batch_location); + +static inline void +anv_batch_set_storage(struct anv_batch *batch, struct anv_address addr, + void *map, size_t size) +{ + batch->start_addr = addr; + batch->next = batch->start = map; + batch->end = map + size; +} + +static inline VkResult +anv_batch_set_error(struct anv_batch *batch, VkResult error) +{ + assert(error != VK_SUCCESS); + if (batch->status == VK_SUCCESS) + batch->status = error; + return batch->status; +} + +static inline bool +anv_batch_has_error(struct anv_batch *batch) +{ + return batch->status != VK_SUCCESS; +} + +static inline uint64_t +anv_batch_emit_reloc(struct anv_batch *batch, + void *location, struct anv_bo *bo, uint32_t delta) +{ + uint64_t address_u64 = 0; + VkResult result; + + if (ANV_ALWAYS_SOFTPIN) { + address_u64 = bo->offset + delta; + result = anv_reloc_list_add_bo(batch->relocs, batch->alloc, bo); + } else { + result = anv_reloc_list_add(batch->relocs, batch->alloc, + location - batch->start, bo, delta, + &address_u64); + } + if (unlikely(result != VK_SUCCESS)) { + anv_batch_set_error(batch, result); + return 0; + } + + return address_u64; +} + +static inline void +write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush) +{ + unsigned reloc_size = 0; + if (device->info->ver >= 8) { + reloc_size = sizeof(uint64_t); + *(uint64_t *)p = intel_canonical_address(v); + } else { + reloc_size = sizeof(uint32_t); + *(uint32_t *)p = v; + } + + if (flush && device->physical->memory.need_clflush) + intel_flush_range(p, reloc_size); +} + +static inline uint64_t +_anv_combine_address(struct anv_batch *batch, void *location, + const struct anv_address address, uint32_t delta) +{ + if (address.bo == NULL) { + return address.offset + delta; + } else if (batch == NULL) { + assert(anv_bo_is_pinned(address.bo)); + return anv_address_physical(anv_address_add(address, delta)); + } else { + assert(batch->start <= location && location < batch->end); + /* i915 relocations are signed. */ + assert(INT32_MIN <= address.offset && address.offset <= INT32_MAX); + return anv_batch_emit_reloc(batch, location, address.bo, address.offset + delta); + } +} + +#define __gen_address_type struct anv_address +#define __gen_user_data struct anv_batch +#define __gen_combine_address _anv_combine_address + +/* Wrapper macros needed to work around preprocessor argument issues. In + * particular, arguments don't get pre-evaluated if they are concatenated. + * This means that, if you pass GENX(3DSTATE_PS) into the emit macro, the + * GENX macro won't get evaluated if the emit macro contains "cmd ## foo". + * We can work around this easily enough with these helpers. 
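+ *
+ * For instance, if the emit macro used "cmd ## _length" directly, then in
+ * anv_batch_emit(batch, GENX(3DSTATE_PS), ps) the ## would be applied to
+ * the unexpanded tokens "GENX(3DSTATE_PS)". Routing it through
+ * __anv_cmd_length(cmd) makes the preprocessor fully expand the argument
+ * first (to something like GFX8_3DSTATE_PS), so the paste yields
+ * GFX8_3DSTATE_PS_length as intended. A typical use of the emit macro
+ * defined below looks roughly like:
+ *
+ *    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ *       pc.CommandStreamerStallEnable = true;
+ *    }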
+ */ +#define __anv_cmd_length(cmd) cmd ## _length +#define __anv_cmd_length_bias(cmd) cmd ## _length_bias +#define __anv_cmd_header(cmd) cmd ## _header +#define __anv_cmd_pack(cmd) cmd ## _pack +#define __anv_reg_num(reg) reg ## _num + +#define anv_pack_struct(dst, struc, ...) do { \ + struct struc __template = { \ + __VA_ARGS__ \ + }; \ + __anv_cmd_pack(struc)(NULL, dst, &__template); \ + VG(VALGRIND_CHECK_MEM_IS_DEFINED(dst, __anv_cmd_length(struc) * 4)); \ + } while (0) + +#define anv_batch_emitn(batch, n, cmd, ...) ({ \ + void *__dst = anv_batch_emit_dwords(batch, n); \ + if (__dst) { \ + struct cmd __template = { \ + __anv_cmd_header(cmd), \ + .DWordLength = n - __anv_cmd_length_bias(cmd), \ + __VA_ARGS__ \ + }; \ + __anv_cmd_pack(cmd)(batch, __dst, &__template); \ + } \ + __dst; \ + }) + +#define anv_batch_emit_merge(batch, dwords0, dwords1) \ + do { \ + uint32_t *dw; \ + \ + STATIC_ASSERT(ARRAY_SIZE(dwords0) == ARRAY_SIZE(dwords1)); \ + dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0)); \ + if (!dw) \ + break; \ + for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++) \ + dw[i] = (dwords0)[i] | (dwords1)[i]; \ + VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4));\ + } while (0) + +#define anv_batch_emit(batch, cmd, name) \ + for (struct cmd name = { __anv_cmd_header(cmd) }, \ + *_dst = anv_batch_emit_dwords(batch, __anv_cmd_length(cmd)); \ + __builtin_expect(_dst != NULL, 1); \ + ({ __anv_cmd_pack(cmd)(batch, _dst, &name); \ + VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \ + _dst = NULL; \ + })) + +#define anv_batch_write_reg(batch, reg, name) \ + for (struct reg name = {}, *_cont = (struct reg *)1; _cont != NULL; \ + ({ \ + uint32_t _dw[__anv_cmd_length(reg)]; \ + __anv_cmd_pack(reg)(NULL, _dw, &name); \ + for (unsigned i = 0; i < __anv_cmd_length(reg); i++) { \ + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { \ + lri.RegisterOffset = __anv_reg_num(reg); \ + lri.DataDWord = _dw[i]; \ + } \ + } \ + _cont = NULL; \ + })) + +/* #define __gen_get_batch_dwords anv_batch_emit_dwords */ +/* #define __gen_get_batch_address anv_batch_address */ +/* #define __gen_address_value anv_address_physical */ +/* #define __gen_address_offset anv_address_add */ + +struct anv_device_memory { + struct vk_object_base base; + + struct list_head link; + + struct anv_bo * bo; + const struct anv_memory_type * type; + + void * map; + size_t map_size; + + /* The map, from the user PoV is map + map_delta */ + uint64_t map_delta; + + /* If set, we are holding reference to AHardwareBuffer + * which we must release when memory is freed. + */ + struct AHardwareBuffer * ahw; + + /* If set, this memory comes from a host pointer. */ + void * host_ptr; +}; + +/** + * Header for Vertex URB Entry (VUE) + */ +struct anv_vue_header { + uint32_t Reserved; + uint32_t RTAIndex; /* RenderTargetArrayIndex */ + uint32_t ViewportIndex; + float PointWidth; +}; + +/** Struct representing a sampled image descriptor + * + * This descriptor layout is used for sampled images, bare sampler, and + * combined image/sampler descriptors. + */ +struct anv_sampled_image_descriptor { + /** Bindless image handle + * + * This is expected to already be shifted such that the 20-bit + * SURFACE_STATE table index is in the top 20 bits. + */ + uint32_t image; + + /** Bindless sampler handle + * + * This is assumed to be a 32B-aligned SAMPLER_STATE pointer relative + * to the dynamic state base address. 
+ */ + uint32_t sampler; +}; + +struct anv_texture_swizzle_descriptor { + /** Texture swizzle + * + * See also nir_intrinsic_channel_select_intel + */ + uint8_t swizzle[4]; + + /** Unused padding to ensure the struct is a multiple of 64 bits */ + uint32_t _pad; +}; + +/** Struct representing a storage image descriptor */ +struct anv_storage_image_descriptor { + /** Bindless image handles + * + * These are expected to already be shifted such that the 20-bit + * SURFACE_STATE table index is in the top 20 bits. + */ + uint32_t vanilla; + uint32_t lowered; +}; + +/** Struct representing a address/range descriptor + * + * The fields of this struct correspond directly to the data layout of + * nir_address_format_64bit_bounded_global addresses. The last field is the + * offset in the NIR address so it must be zero so that when you load the + * descriptor you get a pointer to the start of the range. + */ +struct anv_address_range_descriptor { + uint64_t address; + uint32_t range; + uint32_t zero; +}; + +enum anv_descriptor_data { + /** The descriptor contains a BTI reference to a surface state */ + ANV_DESCRIPTOR_SURFACE_STATE = (1 << 0), + /** The descriptor contains a BTI reference to a sampler state */ + ANV_DESCRIPTOR_SAMPLER_STATE = (1 << 1), + /** The descriptor contains an actual buffer view */ + ANV_DESCRIPTOR_BUFFER_VIEW = (1 << 2), + /** The descriptor contains auxiliary image layout data */ + ANV_DESCRIPTOR_IMAGE_PARAM = (1 << 3), + /** The descriptor contains auxiliary image layout data */ + ANV_DESCRIPTOR_INLINE_UNIFORM = (1 << 4), + /** anv_address_range_descriptor with a buffer address and range */ + ANV_DESCRIPTOR_ADDRESS_RANGE = (1 << 5), + /** Bindless surface handle */ + ANV_DESCRIPTOR_SAMPLED_IMAGE = (1 << 6), + /** Storage image handles */ + ANV_DESCRIPTOR_STORAGE_IMAGE = (1 << 7), + /** Storage image handles */ + ANV_DESCRIPTOR_TEXTURE_SWIZZLE = (1 << 8), +}; + +struct anv_descriptor_set_binding_layout { + /* The type of the descriptors in this binding */ + VkDescriptorType type; + + /* Flags provided when this binding was created */ + VkDescriptorBindingFlags flags; + + /* Bitfield representing the type of data this descriptor contains */ + enum anv_descriptor_data data; + + /* Maximum number of YCbCr texture/sampler planes */ + uint8_t max_plane_count; + + /* Number of array elements in this binding (or size in bytes for inline + * uniform data) + */ + uint32_t array_size; + + /* Index into the flattened descriptor set */ + uint32_t descriptor_index; + + /* Index into the dynamic state array for a dynamic buffer */ + int16_t dynamic_offset_index; + + /* Index into the descriptor set buffer views */ + int32_t buffer_view_index; + + /* Offset into the descriptor buffer where this descriptor lives */ + uint32_t descriptor_offset; + + /* Pre computed stride */ + unsigned descriptor_stride; + + /* Immutable samplers (or NULL if no immutable samplers) */ + struct anv_sampler **immutable_samplers; +}; + +bool anv_descriptor_supports_bindless(const struct anv_physical_device *pdevice, + const struct anv_descriptor_set_binding_layout *binding, + bool sampler); + +bool anv_descriptor_requires_bindless(const struct anv_physical_device *pdevice, + const struct anv_descriptor_set_binding_layout *binding, + bool sampler); + +struct anv_descriptor_set_layout { + struct vk_object_base base; + + /* Descriptor set layouts can be destroyed at almost any time */ + uint32_t ref_cnt; + + /* Number of bindings in this descriptor set */ + uint32_t binding_count; + + /* Total number of descriptors 
*/ + uint32_t descriptor_count; + + /* Shader stages affected by this descriptor set */ + uint16_t shader_stages; + + /* Number of buffer views in this descriptor set */ + uint32_t buffer_view_count; + + /* Number of dynamic offsets used by this descriptor set */ + uint16_t dynamic_offset_count; + + /* For each dynamic buffer, which VkShaderStageFlagBits stages are using + * this buffer + */ + VkShaderStageFlags dynamic_offset_stages[MAX_DYNAMIC_BUFFERS]; + + /* Size of the descriptor buffer for this descriptor set */ + uint32_t descriptor_buffer_size; + + /* Bindings in this descriptor set */ + struct anv_descriptor_set_binding_layout binding[0]; +}; + +void anv_descriptor_set_layout_destroy(struct anv_device *device, + struct anv_descriptor_set_layout *layout); + +static inline void +anv_descriptor_set_layout_ref(struct anv_descriptor_set_layout *layout) +{ + assert(layout && layout->ref_cnt >= 1); + p_atomic_inc(&layout->ref_cnt); +} + +static inline void +anv_descriptor_set_layout_unref(struct anv_device *device, + struct anv_descriptor_set_layout *layout) +{ + assert(layout && layout->ref_cnt >= 1); + if (p_atomic_dec_zero(&layout->ref_cnt)) + anv_descriptor_set_layout_destroy(device, layout); +} + +struct anv_descriptor { + VkDescriptorType type; + + union { + struct { + VkImageLayout layout; + struct anv_image_view *image_view; + struct anv_sampler *sampler; + }; + + struct { + struct anv_buffer_view *set_buffer_view; + struct anv_buffer *buffer; + uint64_t offset; + uint64_t range; + }; + + struct anv_buffer_view *buffer_view; + + struct anv_acceleration_structure *accel_struct; + }; +}; + +struct anv_descriptor_set { + struct vk_object_base base; + + struct anv_descriptor_pool *pool; + struct anv_descriptor_set_layout *layout; + + /* Amount of space occupied in the the pool by this descriptor set. It can + * be larger than the size of the descriptor set. + */ + uint32_t size; + + /* State relative to anv_descriptor_pool::bo */ + struct anv_state desc_mem; + /* Surface state for the descriptor buffer */ + struct anv_state desc_surface_state; + + /* Descriptor set address. */ + struct anv_address desc_addr; + + uint32_t buffer_view_count; + struct anv_buffer_view *buffer_views; + + /* Link to descriptor pool's desc_sets list . */ + struct list_head pool_link; + + uint32_t descriptor_count; + struct anv_descriptor descriptors[0]; +}; + +static inline bool +anv_descriptor_set_is_push(struct anv_descriptor_set *set) +{ + return set->pool == NULL; +} + +struct anv_buffer_view { + struct vk_object_base base; + + uint64_t range; /**< VkBufferViewCreateInfo::range */ + + struct anv_address address; + + struct anv_state surface_state; + struct anv_state storage_surface_state; + struct anv_state lowered_storage_surface_state; + + struct brw_image_param lowered_storage_image_param; +}; + +struct anv_push_descriptor_set { + struct anv_descriptor_set set; + + /* Put this field right behind anv_descriptor_set so it fills up the + * descriptors[0] field. */ + struct anv_descriptor descriptors[MAX_PUSH_DESCRIPTORS]; + + /** True if the descriptor set buffer has been referenced by a draw or + * dispatch command. + */ + bool set_used_on_gpu; + + struct anv_buffer_view buffer_views[MAX_PUSH_DESCRIPTORS]; +}; + +static inline struct anv_address +anv_descriptor_set_address(struct anv_descriptor_set *set) +{ + if (anv_descriptor_set_is_push(set)) { + /* We have to flag push descriptor set as used on the GPU + * so that the next time we push descriptors, we grab a new memory. 
+ */ + struct anv_push_descriptor_set *push_set = + (struct anv_push_descriptor_set *)set; + push_set->set_used_on_gpu = true; + } + + return set->desc_addr; +} + +struct anv_descriptor_pool { + struct vk_object_base base; + + uint32_t size; + uint32_t next; + uint32_t free_list; + + struct anv_bo *bo; + struct util_vma_heap bo_heap; + + struct anv_state_stream surface_state_stream; + void *surface_state_free_list; + + struct list_head desc_sets; + + bool host_only; + + char data[0]; +}; + +struct anv_descriptor_template_entry { + /* The type of descriptor in this entry */ + VkDescriptorType type; + + /* Binding in the descriptor set */ + uint32_t binding; + + /* Offset at which to write into the descriptor set binding */ + uint32_t array_element; + + /* Number of elements to write into the descriptor set binding */ + uint32_t array_count; + + /* Offset into the user provided data */ + size_t offset; + + /* Stride between elements into the user provided data */ + size_t stride; +}; + +struct anv_descriptor_update_template { + struct vk_object_base base; + + VkPipelineBindPoint bind_point; + + /* The descriptor set this template corresponds to. This value is only + * valid if the template was created with the templateType + * VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET. + */ + uint8_t set; + + /* Number of entries in this template */ + uint32_t entry_count; + + /* Entries of the template */ + struct anv_descriptor_template_entry entries[0]; +}; + +size_t +anv_descriptor_set_layout_size(const struct anv_descriptor_set_layout *layout, + uint32_t var_desc_count); + +uint32_t +anv_descriptor_set_layout_descriptor_buffer_size(const struct anv_descriptor_set_layout *set_layout, + uint32_t var_desc_count); + +void +anv_descriptor_set_write_image_view(struct anv_device *device, + struct anv_descriptor_set *set, + const VkDescriptorImageInfo * const info, + VkDescriptorType type, + uint32_t binding, + uint32_t element); + +void +anv_descriptor_set_write_buffer_view(struct anv_device *device, + struct anv_descriptor_set *set, + VkDescriptorType type, + struct anv_buffer_view *buffer_view, + uint32_t binding, + uint32_t element); + +void +anv_descriptor_set_write_buffer(struct anv_device *device, + struct anv_descriptor_set *set, + struct anv_state_stream *alloc_stream, + VkDescriptorType type, + struct anv_buffer *buffer, + uint32_t binding, + uint32_t element, + VkDeviceSize offset, + VkDeviceSize range); + +void +anv_descriptor_set_write_acceleration_structure(struct anv_device *device, + struct anv_descriptor_set *set, + struct anv_acceleration_structure *accel, + uint32_t binding, + uint32_t element); + +void +anv_descriptor_set_write_inline_uniform_data(struct anv_device *device, + struct anv_descriptor_set *set, + uint32_t binding, + const void *data, + size_t offset, + size_t size); + +void +anv_descriptor_set_write_template(struct anv_device *device, + struct anv_descriptor_set *set, + struct anv_state_stream *alloc_stream, + const struct anv_descriptor_update_template *template, + const void *data); + +#define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 5) +#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 4) +#define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 3) +#define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS (UINT8_MAX - 2) +#define ANV_DESCRIPTOR_SET_SHADER_CONSTANTS (UINT8_MAX - 1) +#define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX + +struct anv_pipeline_binding { + /** Index in the descriptor set + * + * This is a flattened index; the descriptor set layout is already taken + * 
into account. + */ + uint32_t index; + + /** The descriptor set this surface corresponds to. + * + * The special ANV_DESCRIPTOR_SET_* values above indicates that this + * binding is not a normal descriptor set but something else. + */ + uint8_t set; + + union { + /** Plane in the binding index for images */ + uint8_t plane; + + /** Dynamic offset index (for dynamic UBOs and SSBOs) */ + uint8_t dynamic_offset_index; + }; + + /** For a storage image, whether it requires a lowered surface */ + uint8_t lowered_storage_surface; + + /** Pad to 64 bits so that there are no holes and we can safely memcmp + * assuming POD zero-initialization. + */ + uint8_t pad; +}; + +struct anv_push_range { + /** Index in the descriptor set */ + uint32_t index; + + /** Descriptor set index */ + uint8_t set; + + /** Dynamic offset index (for dynamic UBOs) */ + uint8_t dynamic_offset_index; + + /** Start offset in units of 32B */ + uint8_t start; + + /** Range in units of 32B */ + uint8_t length; +}; + +struct anv_pipeline_layout { + struct vk_object_base base; + + struct { + struct anv_descriptor_set_layout *layout; + uint32_t dynamic_offset_start; + } set[MAX_SETS]; + + uint32_t num_sets; + + unsigned char sha1[20]; +}; + +struct anv_buffer { + struct vk_buffer vk; + + /* Set when bound */ + struct anv_address address; +}; + +enum anv_cmd_dirty_bits { + ANV_CMD_DIRTY_PIPELINE = 1 << 0, + ANV_CMD_DIRTY_INDEX_BUFFER = 1 << 1, + ANV_CMD_DIRTY_RENDER_TARGETS = 1 << 2, + ANV_CMD_DIRTY_XFB_ENABLE = 1 << 3, +}; +typedef enum anv_cmd_dirty_bits anv_cmd_dirty_mask_t; + +enum anv_pipe_bits { + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT = (1 << 0), + ANV_PIPE_STALL_AT_SCOREBOARD_BIT = (1 << 1), + ANV_PIPE_STATE_CACHE_INVALIDATE_BIT = (1 << 2), + ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT = (1 << 3), + ANV_PIPE_VF_CACHE_INVALIDATE_BIT = (1 << 4), + ANV_PIPE_DATA_CACHE_FLUSH_BIT = (1 << 5), + ANV_PIPE_TILE_CACHE_FLUSH_BIT = (1 << 6), + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT = (1 << 10), + ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT = (1 << 11), + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT = (1 << 12), + ANV_PIPE_DEPTH_STALL_BIT = (1 << 13), + + /* ANV_PIPE_HDC_PIPELINE_FLUSH_BIT is a precise way to ensure prior data + * cache work has completed. Available on Gfx12+. For earlier Gfx we + * must reinterpret this flush as ANV_PIPE_DATA_CACHE_FLUSH_BIT. + */ + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT = (1 << 14), + ANV_PIPE_PSS_STALL_SYNC_BIT = (1 << 15), + + /* + * This bit flush data-port's Untyped L1 data cache (LSC L1). + */ + ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT = (1 << 16), + + ANV_PIPE_CS_STALL_BIT = (1 << 20), + ANV_PIPE_END_OF_PIPE_SYNC_BIT = (1 << 21), + + /* This bit does not exist directly in PIPE_CONTROL. Instead it means that + * a flush has happened but not a CS stall. The next time we do any sort + * of invalidation we need to insert a CS stall at that time. Otherwise, + * we would have to CS stall on every flush which could be bad. + */ + ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT = (1 << 22), + + /* This bit does not exist directly in PIPE_CONTROL. It means that render + * target operations related to transfer commands with VkBuffer as + * destination are ongoing. Some operations like copies on the command + * streamer might need to be aware of this to trigger the appropriate stall + * before they can proceed with the copy. + */ + ANV_PIPE_RENDER_TARGET_BUFFER_WRITES = (1 << 23), + + /* This bit does not exist directly in PIPE_CONTROL. It means that Gfx12 + * AUX-TT data has changed and we need to invalidate AUX-TT data. 
This is + * done by writing the AUX-TT register. + */ + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 24), + + /* This bit does not exist directly in PIPE_CONTROL. It means that a + * PIPE_CONTROL with a post-sync operation will follow. This is used to + * implement a workaround for Gfx9. + */ + ANV_PIPE_POST_SYNC_BIT = (1 << 25), +}; + +#define ANV_PIPE_FLUSH_BITS ( \ + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \ + ANV_PIPE_DATA_CACHE_FLUSH_BIT | \ + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | \ + ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT | \ + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | \ + ANV_PIPE_TILE_CACHE_FLUSH_BIT) + +#define ANV_PIPE_STALL_BITS ( \ + ANV_PIPE_STALL_AT_SCOREBOARD_BIT | \ + ANV_PIPE_DEPTH_STALL_BIT | \ + ANV_PIPE_CS_STALL_BIT) + +#define ANV_PIPE_INVALIDATE_BITS ( \ + ANV_PIPE_STATE_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_VF_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | \ + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) + +enum intel_ds_stall_flag +anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits); + +static inline enum anv_pipe_bits +anv_pipe_flush_bits_for_access_flags(struct anv_device *device, + VkAccessFlags2 flags) +{ + enum anv_pipe_bits pipe_bits = 0; + + u_foreach_bit64(b, flags) { + switch ((VkAccessFlags2)BITFIELD64_BIT(b)) { + case VK_ACCESS_2_SHADER_WRITE_BIT: + case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT: + /* We're transitioning a buffer that was previously used as write + * destination through the data port. To make its content available + * to future operations, flush the hdc pipeline. + */ + pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT; + pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT; + break; + case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT: + /* We're transitioning a buffer that was previously used as render + * target. To make its content available to future operations, flush + * the render target cache. + */ + pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + break; + case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: + /* We're transitioning a buffer that was previously used as depth + * buffer. To make its content available to future operations, flush + * the depth cache. + */ + pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; + break; + case VK_ACCESS_2_TRANSFER_WRITE_BIT: + /* We're transitioning a buffer that was previously used as a + * transfer write destination. Generic write operations include color + * & depth operations as well as buffer operations like : + * - vkCmdClearColorImage() + * - vkCmdClearDepthStencilImage() + * - vkCmdBlitImage() + * - vkCmdCopy*(), vkCmdUpdate*(), vkCmdFill*() + * + * Most of these operations are implemented using Blorp which writes + * through the render target, so flush that cache to make it visible + * to future operations. And for depth related operations we also + * need to flush the depth cache. + */ + pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; + break; + case VK_ACCESS_2_MEMORY_WRITE_BIT: + /* We're transitioning a buffer for generic write operations. Flush + * all the caches. + */ + pipe_bits |= ANV_PIPE_FLUSH_BITS; + break; + case VK_ACCESS_2_HOST_WRITE_BIT: + /* We're transitioning a buffer for access by CPU. Invalidate + * all the caches. Since data and tile caches don't have invalidate, + * we are forced to flush those as well. 
+ */ + pipe_bits |= ANV_PIPE_FLUSH_BITS; + pipe_bits |= ANV_PIPE_INVALIDATE_BITS; + break; + case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT: + case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT: + /* We're transitioning a buffer written either from VS stage or from + * the command streamer (see CmdEndTransformFeedbackEXT), we just + * need to stall the CS. + */ + pipe_bits |= ANV_PIPE_CS_STALL_BIT; + break; + default: + break; /* Nothing to do */ + } + } + + return pipe_bits; +} + +static inline enum anv_pipe_bits +anv_pipe_invalidate_bits_for_access_flags(struct anv_device *device, + VkAccessFlags2 flags) +{ + enum anv_pipe_bits pipe_bits = 0; + + u_foreach_bit64(b, flags) { + switch ((VkAccessFlags2)BITFIELD64_BIT(b)) { + case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT: + /* Indirect draw commands take a buffer as input that we're going to + * read from the command streamer to load some of the HW registers + * (see genX_cmd_buffer.c:load_indirect_parameters). This requires a + * command streamer stall so that all the cache flushes have + * completed before the command streamer loads from memory. + */ + pipe_bits |= ANV_PIPE_CS_STALL_BIT; + /* Indirect draw commands also set gl_BaseVertex & gl_BaseIndex + * through a vertex buffer, so invalidate that cache. + */ + pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + /* For CmdDipatchIndirect, we also load gl_NumWorkGroups through a + * UBO from the buffer, so we need to invalidate constant cache. + */ + pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT; + pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT; + /* Tile cache flush needed For CmdDipatchIndirect since command + * streamer and vertex fetch aren't L3 coherent. + */ + pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT; + break; + case VK_ACCESS_2_INDEX_READ_BIT: + case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT: + /* We transitioning a buffer to be used for as input for vkCmdDraw* + * commands, so we invalidate the VF cache to make sure there is no + * stale data when we start rendering. + */ + pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + break; + case VK_ACCESS_2_UNIFORM_READ_BIT: + /* We transitioning a buffer to be used as uniform data. Because + * uniform is accessed through the data port & sampler, we need to + * invalidate the texture cache (sampler) & constant cache (data + * port) to avoid stale data. + */ + pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT; + if (device->physical->compiler->indirect_ubos_use_sampler) { + pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; + } else { + pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT; + pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT; + } + break; + case VK_ACCESS_2_SHADER_READ_BIT: + case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT: + case VK_ACCESS_2_TRANSFER_READ_BIT: + /* Transitioning a buffer to be read through the sampler, so + * invalidate the texture cache, we don't want any stale data. + */ + pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; + break; + case VK_ACCESS_2_MEMORY_READ_BIT: + /* Transitioning a buffer for generic read, invalidate all the + * caches. + */ + pipe_bits |= ANV_PIPE_INVALIDATE_BITS; + break; + case VK_ACCESS_2_MEMORY_WRITE_BIT: + /* Generic write, make sure all previously written things land in + * memory. + */ + pipe_bits |= ANV_PIPE_FLUSH_BITS; + break; + case VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT: + case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT: + /* Transitioning a buffer for conditional rendering or transform + * feedback. 
We'll load the content of this buffer into HW registers
+ * using the command streamer, so we need to stall the command
+ * streamer to make sure any in-flight flush operations have
+ * completed.
+ */
+ pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+ pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_HOST_READ_BIT:
+ /* We're transitioning a buffer that was written by CPU. Flush
+ * all the caches.
+ */
+ pipe_bits |= ANV_PIPE_FLUSH_BITS;
+ break;
+ case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
+ /* We're transitioning a buffer to be written by the streamout fixed
+ * function. This one is apparently not L3 coherent, so we need a
+ * tile cache flush to make sure any previous write is not going to
+ * create WaW hazards.
+ */
+ pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+ break;
+ default:
+ break; /* Nothing to do */
+ }
+ }
+
+ return pipe_bits;
+}
+
+#define VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV ( \
+ VK_IMAGE_ASPECT_COLOR_BIT | \
+ VK_IMAGE_ASPECT_PLANE_0_BIT | \
+ VK_IMAGE_ASPECT_PLANE_1_BIT | \
+ VK_IMAGE_ASPECT_PLANE_2_BIT)
+#define VK_IMAGE_ASPECT_PLANES_BITS_ANV ( \
+ VK_IMAGE_ASPECT_PLANE_0_BIT | \
+ VK_IMAGE_ASPECT_PLANE_1_BIT | \
+ VK_IMAGE_ASPECT_PLANE_2_BIT)
+
+struct anv_vertex_binding {
+ struct anv_buffer * buffer;
+ VkDeviceSize offset;
+ VkDeviceSize size;
+};
+
+struct anv_xfb_binding {
+ struct anv_buffer * buffer;
+ VkDeviceSize offset;
+ VkDeviceSize size;
+};
+
+struct anv_push_constants {
+ /** Push constant data provided by the client through vkPushConstants */
+ uint8_t client_data[MAX_PUSH_CONSTANTS_SIZE];
+
+ /** Dynamic offsets for dynamic UBOs and SSBOs */
+ uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS];
+
+ /* Robust access pushed registers. */
+ uint64_t push_reg_mask[MESA_SHADER_STAGES];
+
+ /** Ray query globals (RT_DISPATCH_GLOBALS) */
+ uint64_t ray_query_globals;
+
+ /* Base addresses for descriptor sets */
+ uint64_t desc_sets[MAX_SETS];
+
+ struct {
+ /** Base workgroup ID
+ *
+ * Used for vkCmdDispatchBase.
+ */
+ uint32_t base_work_group_id[3];
+
+ /** Subgroup ID
+ *
+ * This is never set by software but is implicitly filled out when
+ * uploading the push constants for compute shaders.
+ */
+ uint32_t subgroup_id;
+ } cs;
+};
+
+struct anv_surface_state {
+ struct anv_state state;
+ /** Address of the surface referred to by this state
+ *
+ * This address is relative to the start of the BO.
+ */
+ struct anv_address address;
+ /* Address of the aux surface, if any
+ *
+ * This field is ANV_NULL_ADDRESS if and only if no aux surface exists.
+ *
+ * With the exception of gfx8, the bottom 12 bits of this address' offset
+ * include extra aux information.
+ */
+ struct anv_address aux_address;
+ /* Address of the clear color, if any
+ *
+ * This address is relative to the start of the BO.
+ */
+ struct anv_address clear_address;
+};
+
+struct anv_attachment {
+ VkFormat vk_format;
+ const struct anv_image_view *iview;
+ VkImageLayout layout;
+ enum isl_aux_usage aux_usage;
+ struct anv_surface_state surface_state;
+
+ VkResolveModeFlagBits resolve_mode;
+ const struct anv_image_view *resolve_iview;
+ VkImageLayout resolve_layout;
+};
+
+/** State tracking for vertex buffer flushes
+ *
+ * On Gfx8-9, the VF cache only considers the bottom 32 bits of memory
+ * addresses. If you happen to have two vertex buffers which get placed
+ * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
+ * collisions.
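As a quick illustration (a minimal standalone sketch with made-up addresses, not taken from the driver), two base addresses placed exactly 4 GiB apart truncate to the same low 32 bits:

#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* Hypothetical vertex buffer base addresses, exactly 4 GiB apart. */
   uint64_t vb0 = 0x0000000100001000ull;
   uint64_t vb1 = vb0 + (1ull << 32);

   /* A Gfx8-9 style VF cache tag keeps only the low 32 address bits, so
    * both buffers map to the same tag and their entries can collide. */
   assert((uint32_t)vb0 == (uint32_t)vb1);
   return 0;
}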
In order to solve this problem, we track vertex address ranges + * which are live in the cache and invalidate the cache if one ever exceeds 32 + * bits. + */ +struct anv_vb_cache_range { + /* Virtual address at which the live vertex buffer cache range starts for + * this vertex buffer index. + */ + uint64_t start; + + /* Virtual address of the byte after where vertex buffer cache range ends. + * This is exclusive such that end - start is the size of the range. + */ + uint64_t end; +}; + +/* Check whether we need to apply the Gfx8-9 vertex buffer workaround*/ +static inline bool +anv_gfx8_9_vb_cache_range_needs_workaround(struct anv_vb_cache_range *bound, + struct anv_vb_cache_range *dirty, + struct anv_address vb_address, + uint32_t vb_size) +{ + if (vb_size == 0) { + bound->start = 0; + bound->end = 0; + return false; + } + + assert(vb_address.bo && anv_bo_is_pinned(vb_address.bo)); + bound->start = intel_48b_address(anv_address_physical(vb_address)); + bound->end = bound->start + vb_size; + assert(bound->end > bound->start); /* No overflow */ + + /* Align everything to a cache line */ + bound->start &= ~(64ull - 1ull); + bound->end = align_u64(bound->end, 64); + + /* Compute the dirty range */ + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + + /* If our range is larger than 32 bits, we have to flush */ + assert(bound->end - bound->start <= (1ull << 32)); + return (dirty->end - dirty->start) > (1ull << 32); +} + +/** State tracking for particular pipeline bind point + * + * This struct is the base struct for anv_cmd_graphics_state and + * anv_cmd_compute_state. These are used to track state which is bound to a + * particular type of pipeline. Generic state that applies per-stage such as + * binding table offsets and push constants is tracked generically with a + * per-stage array in anv_cmd_state. + */ +struct anv_cmd_pipeline_state { + struct anv_descriptor_set *descriptors[MAX_SETS]; + struct anv_push_descriptor_set *push_descriptors[MAX_SETS]; + + struct anv_push_constants push_constants; + + /* Push constant state allocated when flushing push constants. */ + struct anv_state push_constants_state; +}; + +/** State tracking for graphics pipeline + * + * This has anv_cmd_pipeline_state as a base struct to track things which get + * bound to a graphics pipeline. Along with general pipeline bind point state + * which is in the anv_cmd_pipeline_state base struct, it also contains other + * state which is graphics-specific. 
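Returning to the access-flag helpers above, here is a minimal sketch (the wrapper name is hypothetical; anv_add_pending_pipe_bits() is declared further down in this header) of how anv_pipe_flush_bits_for_access_flags() and anv_pipe_invalidate_bits_for_access_flags() are typically combined when handling a memory barrier:

/* Hypothetical illustration: fold a barrier's source/destination access
 * masks into the command buffer's pending PIPE_CONTROL bits. */
static void
example_accumulate_barrier_bits(struct anv_cmd_buffer *cmd_buffer,
                                VkAccessFlags2 src_access,
                                VkAccessFlags2 dst_access)
{
   enum anv_pipe_bits bits =
      anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_access) |
      anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_access);

   anv_add_pending_pipe_bits(cmd_buffer, bits, "example barrier");
}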
+ */ +struct anv_cmd_graphics_state { + struct anv_cmd_pipeline_state base; + + struct anv_graphics_pipeline *pipeline; + + VkRenderingFlags rendering_flags; + VkRect2D render_area; + uint32_t layer_count; + uint32_t samples; + uint32_t view_mask; + uint32_t color_att_count; + struct anv_state att_states; + struct anv_attachment color_att[MAX_RTS]; + struct anv_attachment depth_att; + struct anv_attachment stencil_att; + struct anv_state null_surface_state; + + anv_cmd_dirty_mask_t dirty; + uint32_t vb_dirty; + + struct anv_vb_cache_range ib_bound_range; + struct anv_vb_cache_range ib_dirty_range; + struct anv_vb_cache_range vb_bound_ranges[33]; + struct anv_vb_cache_range vb_dirty_ranges[33]; + + uint32_t restart_index; + + VkShaderStageFlags push_constant_stages; + + uint32_t primitive_topology; + + struct anv_buffer *index_buffer; + uint32_t index_type; /**< 3DSTATE_INDEX_BUFFER.IndexFormat */ + uint32_t index_offset; + + struct vk_sample_locations_state sample_locations; +}; + +enum anv_depth_reg_mode { + ANV_DEPTH_REG_MODE_UNKNOWN = 0, + ANV_DEPTH_REG_MODE_HW_DEFAULT, + ANV_DEPTH_REG_MODE_D16_1X_MSAA, +}; + +/** State tracking for compute pipeline + * + * This has anv_cmd_pipeline_state as a base struct to track things which get + * bound to a compute pipeline. Along with general pipeline bind point state + * which is in the anv_cmd_pipeline_state base struct, it also contains other + * state which is compute-specific. + */ +struct anv_cmd_compute_state { + struct anv_cmd_pipeline_state base; + + struct anv_compute_pipeline *pipeline; + + bool pipeline_dirty; + + struct anv_state push_data; + + struct anv_address num_workgroups; +}; + +struct anv_cmd_ray_tracing_state { + struct anv_cmd_pipeline_state base; + + struct anv_ray_tracing_pipeline *pipeline; + + bool pipeline_dirty; + + struct { + struct anv_bo *bo; + struct brw_rt_scratch_layout layout; + } scratch; +}; + +/** State required while building cmd buffer */ +struct anv_cmd_state { + /* PIPELINE_SELECT.PipelineSelection */ + uint32_t current_pipeline; + const struct intel_l3_config * current_l3_config; + uint32_t last_aux_map_state; + + struct anv_cmd_graphics_state gfx; + struct anv_cmd_compute_state compute; + struct anv_cmd_ray_tracing_state rt; + + enum anv_pipe_bits pending_pipe_bits; + VkShaderStageFlags descriptors_dirty; + VkShaderStageFlags push_constants_dirty; + + struct anv_vertex_binding vertex_bindings[MAX_VBS]; + bool xfb_enabled; + struct anv_xfb_binding xfb_bindings[MAX_XFB_BUFFERS]; + struct anv_state binding_tables[MESA_VULKAN_SHADER_STAGES]; + struct anv_state samplers[MESA_VULKAN_SHADER_STAGES]; + + unsigned char sampler_sha1s[MESA_VULKAN_SHADER_STAGES][20]; + unsigned char surface_sha1s[MESA_VULKAN_SHADER_STAGES][20]; + unsigned char push_sha1s[MESA_VULKAN_SHADER_STAGES][20]; + + /** + * Whether or not the gfx8 PMA fix is enabled. We ensure that, at the top + * of any command buffer it is disabled by disabling it in EndCommandBuffer + * and before invoking the secondary in ExecuteCommands. + */ + bool pma_fix_enabled; + + /** + * Whether or not we know for certain that HiZ is enabled for the current + * subpass. If, for whatever reason, we are unsure as to whether HiZ is + * enabled or not, this will be false. + */ + bool hiz_enabled; + + /* We ensure the registers for the gfx12 D16 fix are initialized at the + * first non-NULL depth stencil packet emission of every command buffer. + * For secondary command buffer execution, we transfer the state from the + * last command buffer to the primary (if known). 
+ */ + enum anv_depth_reg_mode depth_reg_mode; + + bool conditional_render_enabled; + + /** + * Last rendering scale argument provided to + * genX(cmd_buffer_emit_hashing_mode)(). + */ + unsigned current_hash_scale; + + /** + * A buffer used for spill/fill of ray queries. + */ + struct anv_bo * ray_query_shadow_bo; +}; + +#define ANV_MIN_CMD_BUFFER_BATCH_SIZE 8192 +#define ANV_MAX_CMD_BUFFER_BATCH_SIZE (16 * 1024 * 1024) + +enum anv_cmd_buffer_exec_mode { + ANV_CMD_BUFFER_EXEC_MODE_PRIMARY, + ANV_CMD_BUFFER_EXEC_MODE_EMIT, + ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT, + ANV_CMD_BUFFER_EXEC_MODE_CHAIN, + ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN, + ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN, +}; + +struct anv_measure_batch; + +struct anv_cmd_buffer { + struct vk_command_buffer vk; + + struct anv_device * device; + struct anv_queue_family * queue_family; + + struct anv_batch batch; + + /* Pointer to the location in the batch where MI_BATCH_BUFFER_END was + * recorded upon calling vkEndCommandBuffer(). This is useful if we need to + * rewrite the end to chain multiple batch together at vkQueueSubmit(). + */ + void * batch_end; + + /* Fields required for the actual chain of anv_batch_bo's. + * + * These fields are initialized by anv_cmd_buffer_init_batch_bo_chain(). + */ + struct list_head batch_bos; + enum anv_cmd_buffer_exec_mode exec_mode; + + /* A vector of anv_batch_bo pointers for every batch or surface buffer + * referenced by this command buffer + * + * initialized by anv_cmd_buffer_init_batch_bo_chain() + */ + struct u_vector seen_bbos; + + /* A vector of int32_t's for every block of binding tables. + * + * initialized by anv_cmd_buffer_init_batch_bo_chain() + */ + struct u_vector bt_block_states; + struct anv_state bt_next; + + struct anv_reloc_list surface_relocs; + /** Last seen surface state block pool center bo offset */ + uint32_t last_ss_pool_center; + + /* Serial for tracking buffer completion */ + uint32_t serial; + + /* Stream objects for storing temporary data */ + struct anv_state_stream surface_state_stream; + struct anv_state_stream dynamic_state_stream; + struct anv_state_stream general_state_stream; + + VkCommandBufferUsageFlags usage_flags; + + struct anv_query_pool *perf_query_pool; + + struct anv_cmd_state state; + + struct anv_address return_addr; + + /* Set by SetPerformanceMarkerINTEL, written into queries by CmdBeginQuery */ + uint64_t intel_perf_marker; + + struct anv_measure_batch *measure; + + /** + * KHR_performance_query requires self modifying command buffers and this + * array has the location of modifying commands to the query begin and end + * instructions storing performance counters. The array length is + * anv_physical_device::n_perf_query_commands. + */ + struct mi_address_token *self_mod_locations; + + /** + * Index tracking which of the self_mod_locations items have already been + * used. + */ + uint32_t perf_reloc_idx; + + /** + * Sum of all the anv_batch_bo sizes allocated for this command buffer. + * Used to increase allocation size for long command buffers. + */ + uint32_t total_batch_size; + + /** + * + */ + struct u_trace trace; +}; + +/* Determine whether we can chain a given cmd_buffer to another one. We need + * softpin and we also need to make sure that we can edit the end of the batch + * to point to next one, which requires the command buffer to not be used + * simultaneously. 
+ */ +static inline bool +anv_cmd_buffer_is_chainable(struct anv_cmd_buffer *cmd_buffer) +{ + return !anv_use_relocations(cmd_buffer->device->physical) && + !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT); +} + +VkResult anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer); +void anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer); +void anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer); +void anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer); +void anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, + struct anv_cmd_buffer *secondary); +void anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer); +VkResult anv_cmd_buffer_execbuf(struct anv_queue *queue, + struct anv_cmd_buffer *cmd_buffer, + const VkSemaphore *in_semaphores, + const uint64_t *in_wait_values, + uint32_t num_in_semaphores, + const VkSemaphore *out_semaphores, + const uint64_t *out_signal_values, + uint32_t num_out_semaphores, + VkFence fence, + int perf_query_pass); + +VkResult anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer); + +struct anv_state anv_cmd_buffer_emit_dynamic(struct anv_cmd_buffer *cmd_buffer, + const void *data, uint32_t size, uint32_t alignment); +struct anv_state anv_cmd_buffer_merge_dynamic(struct anv_cmd_buffer *cmd_buffer, + uint32_t *a, uint32_t *b, + uint32_t dwords, uint32_t alignment); + +struct anv_address +anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer); +struct anv_state +anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer, + uint32_t entries, uint32_t *state_offset); +struct anv_state +anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer); +struct anv_state +anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer, + uint32_t size, uint32_t alignment); + +VkResult +anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer); + +void anv_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer); + +struct anv_state +anv_cmd_buffer_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer); +struct anv_state +anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer); + +VkResult +anv_cmd_buffer_alloc_blorp_binding_table(struct anv_cmd_buffer *cmd_buffer, + uint32_t num_entries, + uint32_t *state_offset, + struct anv_state *bt_state); + +void anv_cmd_buffer_dump(struct anv_cmd_buffer *cmd_buffer); + +void anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer); + +enum anv_bo_sync_state { + /** Indicates that this is a new (or newly reset fence) */ + ANV_BO_SYNC_STATE_RESET, + + /** Indicates that this fence has been submitted to the GPU but is still + * (as far as we know) in use by the GPU. 
+ */ + ANV_BO_SYNC_STATE_SUBMITTED, + + ANV_BO_SYNC_STATE_SIGNALED, +}; + +struct anv_bo_sync { + struct vk_sync sync; + + enum anv_bo_sync_state state; + struct anv_bo *bo; +}; + +extern const struct vk_sync_type anv_bo_sync_type; + +static inline bool +vk_sync_is_anv_bo_sync(const struct vk_sync *sync) +{ + return sync->type == &anv_bo_sync_type; +} + +VkResult anv_create_sync_for_memory(struct vk_device *device, + VkDeviceMemory memory, + bool signal_memory, + struct vk_sync **sync_out); + +struct anv_event { + struct vk_object_base base; + uint64_t semaphore; + struct anv_state state; +}; + +#define ANV_STAGE_MASK ((1 << MESA_VULKAN_SHADER_STAGES) - 1) + +#define anv_foreach_stage(stage, stage_bits) \ + for (gl_shader_stage stage, \ + __tmp = (gl_shader_stage)((stage_bits) & ANV_STAGE_MASK); \ + stage = __builtin_ffs(__tmp) - 1, __tmp; \ + __tmp &= ~(1 << (stage))) + +struct anv_pipeline_bind_map { + unsigned char surface_sha1[20]; + unsigned char sampler_sha1[20]; + unsigned char push_sha1[20]; + + uint32_t surface_count; + uint32_t sampler_count; + + struct anv_pipeline_binding * surface_to_descriptor; + struct anv_pipeline_binding * sampler_to_descriptor; + + struct anv_push_range push_ranges[4]; +}; + +struct anv_shader_bin { + struct vk_pipeline_cache_object base; + + gl_shader_stage stage; + + struct anv_state kernel; + uint32_t kernel_size; + + const struct brw_stage_prog_data *prog_data; + uint32_t prog_data_size; + + struct brw_compile_stats stats[3]; + uint32_t num_stats; + + struct nir_xfb_info *xfb_info; + + struct anv_pipeline_bind_map bind_map; +}; + +struct anv_shader_bin * +anv_shader_bin_create(struct anv_device *device, + gl_shader_stage stage, + const void *key, uint32_t key_size, + const void *kernel, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, + const struct brw_compile_stats *stats, uint32_t num_stats, + const struct nir_xfb_info *xfb_info, + const struct anv_pipeline_bind_map *bind_map); + +static inline void +anv_shader_bin_ref(struct anv_shader_bin *shader) +{ + vk_pipeline_cache_object_ref(&shader->base); +} + +static inline void +anv_shader_bin_unref(struct anv_device *device, struct anv_shader_bin *shader) +{ + vk_pipeline_cache_object_unref(&shader->base); +} + +#define anv_shader_bin_get_bsr(bin, local_arg_offset) ({ \ + assert((local_arg_offset) % 8 == 0); \ + const struct brw_bs_prog_data *prog_data = \ + brw_bs_prog_data_const(bin->prog_data); \ + assert(prog_data->simd_size == 8 || prog_data->simd_size == 16); \ + \ + (struct GFX_BINDLESS_SHADER_RECORD) { \ + .OffsetToLocalArguments = (local_arg_offset) / 8, \ + .BindlessShaderDispatchMode = \ + prog_data->simd_size == 16 ? 
RT_SIMD16 : RT_SIMD8, \ + .KernelStartPointer = bin->kernel.offset, \ + }; \ +}) + +struct anv_pipeline_executable { + gl_shader_stage stage; + + struct brw_compile_stats stats; + + char *nir; + char *disasm; +}; + +enum anv_pipeline_type { + ANV_PIPELINE_GRAPHICS, + ANV_PIPELINE_COMPUTE, + ANV_PIPELINE_RAY_TRACING, +}; + +struct anv_pipeline { + struct vk_object_base base; + + struct anv_device * device; + + struct anv_batch batch; + struct anv_reloc_list batch_relocs; + + void * mem_ctx; + + enum anv_pipeline_type type; + VkPipelineCreateFlags flags; + + uint32_t ray_queries; + + struct util_dynarray executables; + + const struct intel_l3_config * l3_config; +}; + +struct anv_graphics_pipeline { + struct anv_pipeline base; + + /* Shaders */ + struct anv_shader_bin * shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT]; + + VkShaderStageFlags active_stages; + + struct vk_sample_locations_state sample_locations; + struct vk_dynamic_graphics_state dynamic_state; + + /* These fields are required with dynamic primitive topology, + * rasterization_samples used only with gen < 8. + */ + VkLineRasterizationModeEXT line_mode; + VkPolygonMode polygon_mode; + uint32_t patch_control_points; + uint32_t rasterization_samples; + + VkColorComponentFlags color_comp_writes[MAX_RTS]; + + uint32_t view_mask; + uint32_t instance_multiplier; + + bool depth_clamp_enable; + bool depth_clip_enable; + bool kill_pixel; + bool force_fragment_thread_dispatch; + bool negative_one_to_one; + + uint32_t vb_used; + struct anv_pipeline_vertex_binding { + uint32_t stride; + bool instanced; + uint32_t instance_divisor; + } vb[MAX_VBS]; + + /* Pre computed CS instructions that can directly be copied into + * anv_cmd_buffer. + */ + uint32_t batch_data[512]; + + /* Pre packed CS instructions & structures that need to be merged later + * with dynamic state. + */ + struct { + uint32_t sf[7]; + uint32_t clip[4]; + uint32_t xfb_bo_pitch[4]; + uint32_t wm[3]; + uint32_t blend_state[MAX_RTS * 2]; + uint32_t streamout_state[3]; + } gfx7; + + struct { + uint32_t sf[4]; + uint32_t raster[5]; + uint32_t wm[2]; + uint32_t ps_blend[2]; + uint32_t blend_state[1 + MAX_RTS * 2]; + uint32_t streamout_state[5]; + } gfx8; +}; + +struct anv_compute_pipeline { + struct anv_pipeline base; + + struct anv_shader_bin * cs; + uint32_t batch_data[9]; + uint32_t interface_descriptor_data[8]; +}; + +struct anv_rt_shader_group { + VkRayTracingShaderGroupTypeKHR type; + + struct anv_shader_bin *general; + struct anv_shader_bin *closest_hit; + struct anv_shader_bin *any_hit; + struct anv_shader_bin *intersection; + + /* VK_KHR_ray_tracing requires shaderGroupHandleSize == 32 */ + uint32_t handle[8]; +}; + +struct anv_ray_tracing_pipeline { + struct anv_pipeline base; + + /* All shaders in the pipeline */ + struct util_dynarray shaders; + + uint32_t group_count; + struct anv_rt_shader_group * groups; + + /* If non-zero, this is the default computed stack size as per the stack + * size computation in the Vulkan spec. If zero, that indicates that the + * client has requested a dynamic stack size. 
+ */ + uint32_t stack_size; +}; + +#define ANV_DECL_PIPELINE_DOWNCAST(pipe_type, pipe_enum) \ + static inline struct anv_##pipe_type##_pipeline * \ + anv_pipeline_to_##pipe_type(struct anv_pipeline *pipeline) \ + { \ + assert(pipeline->type == pipe_enum); \ + return (struct anv_##pipe_type##_pipeline *) pipeline; \ + } + +ANV_DECL_PIPELINE_DOWNCAST(graphics, ANV_PIPELINE_GRAPHICS) +ANV_DECL_PIPELINE_DOWNCAST(compute, ANV_PIPELINE_COMPUTE) +ANV_DECL_PIPELINE_DOWNCAST(ray_tracing, ANV_PIPELINE_RAY_TRACING) + +static inline bool +anv_pipeline_has_stage(const struct anv_graphics_pipeline *pipeline, + gl_shader_stage stage) +{ + return (pipeline->active_stages & mesa_to_vk_shader_stage(stage)) != 0; +} + +static inline bool +anv_pipeline_is_primitive(const struct anv_graphics_pipeline *pipeline) +{ + return anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX); +} + +static inline bool +anv_pipeline_is_mesh(const struct anv_graphics_pipeline *pipeline) +{ + return anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH); +} + +static inline bool +anv_cmd_buffer_all_color_write_masked(const struct anv_cmd_buffer *cmd_buffer) +{ + const struct anv_cmd_graphics_state *state = &cmd_buffer->state.gfx; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint8_t color_writes = dyn->cb.color_write_enables; + + /* All writes disabled through vkCmdSetColorWriteEnableEXT */ + if ((color_writes & ((1u << state->color_att_count) - 1)) == 0) + return true; + + /* Or all write masks are empty */ + for (uint32_t i = 0; i < state->color_att_count; i++) { + if (state->pipeline->color_comp_writes[i] != 0) + return false; + } + + return true; +} + +#define ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(prefix, stage) \ +static inline const struct brw_##prefix##_prog_data * \ +get_##prefix##_prog_data(const struct anv_graphics_pipeline *pipeline) \ +{ \ + if (anv_pipeline_has_stage(pipeline, stage)) { \ + return (const struct brw_##prefix##_prog_data *) \ + pipeline->shaders[stage]->prog_data; \ + } else { \ + return NULL; \ + } \ +} + +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(vs, MESA_SHADER_VERTEX) +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(tcs, MESA_SHADER_TESS_CTRL) +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(tes, MESA_SHADER_TESS_EVAL) +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(gs, MESA_SHADER_GEOMETRY) +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(wm, MESA_SHADER_FRAGMENT) +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(mesh, MESA_SHADER_MESH) +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(task, MESA_SHADER_TASK) + +static inline const struct brw_cs_prog_data * +get_cs_prog_data(const struct anv_compute_pipeline *pipeline) +{ + assert(pipeline->cs); + return (const struct brw_cs_prog_data *) pipeline->cs->prog_data; +} + +static inline const struct brw_vue_prog_data * +anv_pipeline_get_last_vue_prog_data(const struct anv_graphics_pipeline *pipeline) +{ + if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) + return &get_gs_prog_data(pipeline)->base; + else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) + return &get_tes_prog_data(pipeline)->base; + else + return &get_vs_prog_data(pipeline)->base; +} + +VkResult +anv_device_init_rt_shaders(struct anv_device *device); + +void +anv_device_finish_rt_shaders(struct anv_device *device); + +VkResult +anv_pipeline_init(struct anv_pipeline *pipeline, + struct anv_device *device, + enum anv_pipeline_type type, + VkPipelineCreateFlags flags, + const VkAllocationCallbacks *pAllocator); + +void +anv_pipeline_finish(struct anv_pipeline *pipeline, + struct anv_device *device, 
+ const VkAllocationCallbacks *pAllocator); + +struct anv_format_plane { + enum isl_format isl_format:16; + struct isl_swizzle swizzle; + + /* Whether this plane contains chroma channels */ + bool has_chroma; + + /* For downscaling of YUV planes */ + uint8_t denominator_scales[2]; + + /* How to map sampled ycbcr planes to a single 4 component element. */ + struct isl_swizzle ycbcr_swizzle; + + /* What aspect is associated to this plane */ + VkImageAspectFlags aspect; +}; + + +struct anv_format { + struct anv_format_plane planes[3]; + VkFormat vk_format; + uint8_t n_planes; + bool can_ycbcr; +}; + +static inline void +anv_assert_valid_aspect_set(VkImageAspectFlags aspects) +{ + if (util_bitcount(aspects) == 1) { + assert(aspects & (VK_IMAGE_ASPECT_COLOR_BIT | + VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT | + VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT)); + } else if (aspects & VK_IMAGE_ASPECT_PLANES_BITS_ANV) { + assert(aspects == VK_IMAGE_ASPECT_PLANE_0_BIT || + aspects == (VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT) || + aspects == (VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT)); + } else { + assert(aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT)); + } +} + +/** + * Return the aspect's plane relative to all_aspects. For an image, for + * instance, all_aspects would be the set of aspects in the image. For + * an image view, all_aspects would be the subset of aspects represented + * by that particular view. + */ +static inline uint32_t +anv_aspect_to_plane(VkImageAspectFlags all_aspects, + VkImageAspectFlagBits aspect) +{ + anv_assert_valid_aspect_set(all_aspects); + assert(util_bitcount(aspect) == 1); + assert(!(aspect & ~all_aspects)); + + /* Because we always put image and view planes in aspect-bit-order, the + * plane index is the number of bits in all_aspects before aspect. + */ + return util_bitcount(all_aspects & (aspect - 1)); +} + +#define anv_foreach_image_aspect_bit(b, image, aspects) \ + u_foreach_bit(b, vk_image_expand_aspect_mask(&(image)->vk, aspects)) + +const struct anv_format * +anv_get_format(VkFormat format); + +static inline uint32_t +anv_get_format_planes(VkFormat vk_format) +{ + const struct anv_format *format = anv_get_format(vk_format); + + return format != NULL ? format->n_planes : 0; +} + +struct anv_format_plane +anv_get_format_plane(const struct intel_device_info *devinfo, + VkFormat vk_format, uint32_t plane, + VkImageTiling tiling); + +struct anv_format_plane +anv_get_format_aspect(const struct intel_device_info *devinfo, + VkFormat vk_format, + VkImageAspectFlagBits aspect, VkImageTiling tiling); + +static inline enum isl_format +anv_get_isl_format(const struct intel_device_info *devinfo, VkFormat vk_format, + VkImageAspectFlags aspect, VkImageTiling tiling) +{ + return anv_get_format_aspect(devinfo, vk_format, aspect, tiling).isl_format; +} + +bool anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo, + VkImageCreateFlags create_flags, + VkFormat vk_format, VkImageTiling vk_tiling, + VkImageUsageFlags vk_usage, + const VkImageFormatListCreateInfo *fmt_list); + +extern VkFormat +vk_format_from_android(unsigned android_format, unsigned android_usage); + +static inline struct isl_swizzle +anv_swizzle_for_render(struct isl_swizzle swizzle) +{ + /* Sometimes the swizzle will have alpha map to one. 
We do this to fake
+ * RGB as RGBA for texturing
+ */
+ assert(swizzle.a == ISL_CHANNEL_SELECT_ONE ||
+ swizzle.a == ISL_CHANNEL_SELECT_ALPHA);
+
+ /* But it doesn't matter what we render to that channel */
+ swizzle.a = ISL_CHANNEL_SELECT_ALPHA;
+
+ return swizzle;
+}
+
+void
+anv_pipeline_setup_l3_config(struct anv_pipeline *pipeline, bool needs_slm);
+
+/**
+ * Describes how each part of anv_image will be bound to memory.
+ */
+struct anv_image_memory_range {
+ /**
+ * Disjoint bindings into which each portion of the image will be bound.
+ *
+ * Binding images to memory can be complicated and involve binding different
+ * portions of the image to different memory objects or regions. For most
+ * images, everything lives in the MAIN binding and gets bound by
+ * vkBindImageMemory. For disjoint multi-planar images, each plane has
+ * a unique, disjoint binding and gets bound by vkBindImageMemory2 with
+ * VkBindImagePlaneMemoryInfo. There may also exist bits of memory which are
+ * implicit or driver-managed and live in special-case bindings.
+ */
+ enum anv_image_memory_binding {
+ /**
+ * Used if and only if image is not multi-planar disjoint. Bound by
+ * vkBindImageMemory2 without VkBindImagePlaneMemoryInfo.
+ */
+ ANV_IMAGE_MEMORY_BINDING_MAIN,
+
+ /**
+ * Used if and only if image is multi-planar disjoint. Bound by
+ * vkBindImageMemory2 with VkBindImagePlaneMemoryInfo.
+ */
+ ANV_IMAGE_MEMORY_BINDING_PLANE_0,
+ ANV_IMAGE_MEMORY_BINDING_PLANE_1,
+ ANV_IMAGE_MEMORY_BINDING_PLANE_2,
+
+ /**
+ * Driver-private bo. In special cases we may store the aux surface and/or
+ * aux state in this binding.
+ */
+ ANV_IMAGE_MEMORY_BINDING_PRIVATE,
+
+ /** Sentinel */
+ ANV_IMAGE_MEMORY_BINDING_END,
+ } binding;
+
+ /**
+ * Offset is relative to the start of the binding created by
+ * vkBindImageMemory, not to the start of the bo.
+ */
+ uint64_t offset;
+
+ uint64_t size;
+ uint32_t alignment;
+};
+
+/**
+ * Subsurface of an anv_image.
+ */
+struct anv_surface {
+ struct isl_surf isl;
+ struct anv_image_memory_range memory_range;
+};
+
+static inline bool MUST_CHECK
+anv_surface_is_valid(const struct anv_surface *surface)
+{
+ return surface->isl.size_B > 0 && surface->memory_range.size > 0;
+}
+
+struct anv_image {
+ struct vk_image vk;
+
+ uint32_t n_planes;
+
+ /**
+ * Image has multi-planar format and was created with
+ * VK_IMAGE_CREATE_DISJOINT_BIT.
+ */
+ bool disjoint;
+
+ /**
+ * Image was imported from a struct AHardwareBuffer. We have to delay
+ * final image creation until bind time.
+ */
+ bool from_ahb;
+
+ /**
+ * Image was imported from gralloc with VkNativeBufferANDROID. The gralloc bo
+ * must be released when the image is destroyed.
+ */
+ bool from_gralloc;
+
+ /**
+ * The memory bindings created by vkCreateImage and vkBindImageMemory.
+ *
+ * For details on the image's memory layout, see check_memory_bindings().
+ *
+ * vkCreateImage constructs the `memory_range` for each
+ * anv_image_memory_binding. After vkCreateImage, each binding is valid if
+ * and only if `memory_range::size > 0`.
+ *
+ * vkBindImageMemory binds each valid `memory_range` to an `address`.
+ * Usually, the app will provide the address via the parameters of
+ * vkBindImageMemory. However, special-case bindings may be bound to
+ * driver-private memory.
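For reference, a minimal application-side sketch (core Vulkan 1.1; the handles are hypothetical parameters) of how the planes of a disjoint image end up in the PLANE_0/PLANE_1 bindings described above:

#include <vulkan/vulkan.h>

/* Hypothetical sketch: bind two planes of a disjoint multi-planar image.
 * Each plane lands in the matching ANV_IMAGE_MEMORY_BINDING_PLANE_n. */
static void
example_bind_disjoint_planes(VkDevice device, VkImage image,
                             VkDeviceMemory plane0_mem,
                             VkDeviceMemory plane1_mem)
{
   const VkBindImagePlaneMemoryInfo plane0_info = {
      .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
      .planeAspect = VK_IMAGE_ASPECT_PLANE_0_BIT,
   };
   const VkBindImagePlaneMemoryInfo plane1_info = {
      .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
      .planeAspect = VK_IMAGE_ASPECT_PLANE_1_BIT,
   };
   const VkBindImageMemoryInfo bind_infos[2] = {
      {
         .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
         .pNext = &plane0_info,
         .image = image,
         .memory = plane0_mem,
         .memoryOffset = 0,
      },
      {
         .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
         .pNext = &plane1_info,
         .image = image,
         .memory = plane1_mem,
         .memoryOffset = 0,
      },
   };
   vkBindImageMemory2(device, 2, bind_infos);
}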
+ */
+ struct anv_image_binding {
+ struct anv_image_memory_range memory_range;
+ struct anv_address address;
+ } bindings[ANV_IMAGE_MEMORY_BINDING_END];
+
+ /**
+ * Image subsurfaces
+ *
+ * For each plane x, anv_image::planes[x].surface is valid if and only if
+ * anv_image::aspects has an x aspect. Refer to anv_image_aspect_to_plane()
+ * to figure out the plane number associated with a given aspect.
+ *
+ * The hardware requires that the depth buffer and stencil buffer be
+ * separate surfaces. From Vulkan's perspective, though, depth and stencil
+ * reside in the same VkImage. To satisfy both the hardware and Vulkan, we
+ * allocate the depth and stencil buffers as separate surfaces in the same
+ * bo.
+ */
+ struct anv_image_plane {
+ struct anv_surface primary_surface;
+
+ /**
+ * A surface which shadows the main surface and may have different
+ * tiling. This is used for sampling using a tiling that isn't supported
+ * for other operations.
+ */
+ struct anv_surface shadow_surface;
+
+ /**
+ * The base aux usage for this image. For color images, this can be
+ * either CCS_E or CCS_D depending on whether or not we can reliably
+ * leave CCS on all the time.
+ */
+ enum isl_aux_usage aux_usage;
+
+ struct anv_surface aux_surface;
+
+ /** Location of the fast clear state. */
+ struct anv_image_memory_range fast_clear_memory_range;
+
+ /**
+ * Whether this image can be fast cleared with non-zero clear colors.
+ * This can happen with mutable images when formats of different bit
+ * sizes per component are used.
+ *
+ * On Gfx9+, because the clear colors are stored as four 32-bit component
+ * values, we can clear in R16G16_UNORM (store two 16-bit values in
+ * components 0 & 1 of the clear color) and then draw in R32_UINT which
+ * would interpret the clear color as a single component value, using
+ * only the first 16-bit component of the previously written clear color.
+ *
+ * On Gfx7/7.5/8, only CC_ZERO/CC_ONE clear colors are supported, so this
+ * boolean will prevent the usage of CC_ONE.
+ */
+ bool can_non_zero_fast_clear;
+ } planes[3];
+};
+
+static inline bool
+anv_image_is_externally_shared(const struct anv_image *image)
+{
+ return image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID ||
+ image->vk.external_handle_types != 0;
+}
+
+static inline bool
+anv_image_has_private_binding(const struct anv_image *image)
+{
+ const struct anv_image_binding private_binding =
+ image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE];
+ return private_binding.memory_range.size != 0;
+}
+
+/* The ordering of this enum is important */
+enum anv_fast_clear_type {
+ /** Image does not have/support any fast-clear blocks */
+ ANV_FAST_CLEAR_NONE = 0,
+ /** Image has/supports fast-clear but only to the default value */
+ ANV_FAST_CLEAR_DEFAULT_VALUE = 1,
+ /** Image has/supports fast-clear with an arbitrary fast-clear value */
+ ANV_FAST_CLEAR_ANY = 2,
+};
+
+/**
+ * Return the aspect's _format_ plane, not its _memory_ plane (using the
+ * vocabulary of VK_EXT_image_drm_format_modifier). As a consequence, \a
+ * aspect_mask may contain VK_IMAGE_ASPECT_PLANE_*, but must not contain
+ * VK_IMAGE_ASPECT_MEMORY_PLANE_* .
+ */
+static inline uint32_t
+anv_image_aspect_to_plane(const struct anv_image *image,
+ VkImageAspectFlagBits aspect)
+{
+ return anv_aspect_to_plane(image->vk.aspects, aspect);
+}
+
+/* Returns the number of auxiliary buffer levels attached to an image.
*/ +static inline uint8_t +anv_image_aux_levels(const struct anv_image * const image, + VkImageAspectFlagBits aspect) +{ + uint32_t plane = anv_image_aspect_to_plane(image, aspect); + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + return 0; + + return image->vk.mip_levels; +} + +/* Returns the number of auxiliary buffer layers attached to an image. */ +static inline uint32_t +anv_image_aux_layers(const struct anv_image * const image, + VkImageAspectFlagBits aspect, + const uint8_t miplevel) +{ + assert(image); + + /* The miplevel must exist in the main buffer. */ + assert(miplevel < image->vk.mip_levels); + + if (miplevel >= anv_image_aux_levels(image, aspect)) { + /* There are no layers with auxiliary data because the miplevel has no + * auxiliary data. + */ + return 0; + } + + return MAX2(image->vk.array_layers, image->vk.extent.depth >> miplevel); +} + +static inline struct anv_address MUST_CHECK +anv_image_address(const struct anv_image *image, + const struct anv_image_memory_range *mem_range) +{ + const struct anv_image_binding *binding = &image->bindings[mem_range->binding]; + assert(binding->memory_range.offset == 0); + + if (mem_range->size == 0) + return ANV_NULL_ADDRESS; + + return anv_address_add(binding->address, mem_range->offset); +} + +static inline struct anv_address +anv_image_get_clear_color_addr(UNUSED const struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlagBits aspect) +{ + assert(image->vk.aspects & (VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV | + VK_IMAGE_ASPECT_DEPTH_BIT)); + + uint32_t plane = anv_image_aspect_to_plane(image, aspect); + const struct anv_image_memory_range *mem_range = + &image->planes[plane].fast_clear_memory_range; + + return anv_image_address(image, mem_range); +} + +static inline struct anv_address +anv_image_get_fast_clear_type_addr(const struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlagBits aspect) +{ + struct anv_address addr = + anv_image_get_clear_color_addr(device, image, aspect); + + const unsigned clear_color_state_size = device->info->ver >= 10 ? + device->isl_dev.ss.clear_color_state_size : + device->isl_dev.ss.clear_value_size; + return anv_address_add(addr, clear_color_state_size); +} + +static inline struct anv_address +anv_image_get_compression_state_addr(const struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t level, uint32_t array_layer) +{ + assert(level < anv_image_aux_levels(image, aspect)); + assert(array_layer < anv_image_aux_layers(image, aspect, level)); + UNUSED uint32_t plane = anv_image_aspect_to_plane(image, aspect); + assert(image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E); + + /* Relative to start of the plane's fast clear memory range */ + uint32_t offset; + + offset = 4; /* Go past the fast clear type */ + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + for (uint32_t l = 0; l < level; l++) + offset += anv_minify(image->vk.extent.depth, l) * 4; + } else { + offset += level * image->vk.array_layers * 4; + } + + offset += array_layer * 4; + + assert(offset < image->planes[plane].fast_clear_memory_range.size); + + return anv_address_add( + anv_image_get_fast_clear_type_addr(device, image, aspect), + offset); +} + +/* Returns true if a HiZ-enabled depth buffer can be sampled from. 
*/ +static inline bool +anv_can_sample_with_hiz(const struct intel_device_info * const devinfo, + const struct anv_image *image) +{ + if (!(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) + return false; + + /* For Gfx8-11, there are some restrictions around sampling from HiZ. + * The Skylake PRM docs for RENDER_SURFACE_STATE::AuxiliarySurfaceMode + * say: + * + * "If this field is set to AUX_HIZ, Number of Multisamples must + * be MULTISAMPLECOUNT_1, and Surface Type cannot be SURFTYPE_3D." + */ + if (image->vk.image_type == VK_IMAGE_TYPE_3D) + return false; + + /* Allow this feature on BDW even though it is disabled in the BDW devinfo + * struct. There's documentation which suggests that this feature actually + * reduces performance on BDW, but it has only been observed to help so + * far. Sampling fast-cleared blocks on BDW must also be handled with care + * (see depth_stencil_attachment_compute_aux_usage() for more info). + */ + if (devinfo->ver != 8 && !devinfo->has_sample_with_hiz) + return false; + + return image->vk.samples == 1; +} + +/* Returns true if an MCS-enabled buffer can be sampled from. */ +static inline bool +anv_can_sample_mcs_with_clear(const struct intel_device_info * const devinfo, + const struct anv_image *image) +{ + assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_COLOR_BIT); + + assert(isl_aux_usage_has_mcs(image->planes[plane].aux_usage)); + + const struct anv_surface *anv_surf = &image->planes[plane].primary_surface; + + /* On TGL, the sampler has an issue with some 8 and 16bpp MSAA fast clears. + * See HSD 1707282275, wa_14013111325. Due to the use of + * format-reinterpretation, a simplified workaround is implemented. + */ + if (devinfo->ver >= 12 && + isl_format_get_layout(anv_surf->isl.format)->bpb <= 16) { + return false; + } + + return true; +} + +static inline bool +anv_image_plane_uses_aux_map(const struct anv_device *device, + const struct anv_image *image, + uint32_t plane) +{ + return device->info->has_aux_map && + isl_aux_usage_has_ccs(image->planes[plane].aux_usage); +} + +void +anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + uint32_t level, + uint32_t base_layer, + uint32_t layer_count); + +void +anv_image_clear_color(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + enum isl_format format, struct isl_swizzle swizzle, + uint32_t level, uint32_t base_layer, uint32_t layer_count, + VkRect2D area, union isl_color_value clear_color); +void +anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlags aspects, + enum isl_aux_usage depth_aux_usage, + uint32_t level, + uint32_t base_layer, uint32_t layer_count, + VkRect2D area, + float depth_value, uint8_t stencil_value); +void +anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *src_image, + enum isl_aux_usage src_aux_usage, + uint32_t src_level, uint32_t src_base_layer, + const struct anv_image *dst_image, + enum isl_aux_usage dst_aux_usage, + uint32_t dst_level, uint32_t dst_base_layer, + VkImageAspectFlagBits aspect, + uint32_t src_x, uint32_t src_y, + uint32_t dst_x, uint32_t dst_y, + uint32_t width, uint32_t height, + uint32_t layer_count, + enum blorp_filter filter); +void +anv_image_hiz_op(struct anv_cmd_buffer *cmd_buffer, + 
const struct anv_image *image,
+ VkImageAspectFlagBits aspect, uint32_t level,
+ uint32_t base_layer, uint32_t layer_count,
+ enum isl_aux_op hiz_op);
+void
+anv_image_hiz_clear(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ VkImageAspectFlags aspects,
+ uint32_t level,
+ uint32_t base_layer, uint32_t layer_count,
+ VkRect2D area, uint8_t stencil_value);
+void
+anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ enum isl_format format, struct isl_swizzle swizzle,
+ VkImageAspectFlagBits aspect,
+ uint32_t base_layer, uint32_t layer_count,
+ enum isl_aux_op mcs_op, union isl_color_value *clear_value,
+ bool predicate);
+void
+anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ enum isl_format format, struct isl_swizzle swizzle,
+ VkImageAspectFlagBits aspect, uint32_t level,
+ uint32_t base_layer, uint32_t layer_count,
+ enum isl_aux_op ccs_op, union isl_color_value *clear_value,
+ bool predicate);
+
+void
+anv_image_copy_to_shadow(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ VkImageAspectFlagBits aspect,
+ uint32_t base_level, uint32_t level_count,
+ uint32_t base_layer, uint32_t layer_count);
+
+enum isl_aux_state ATTRIBUTE_PURE
+anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
+ const struct anv_image *image,
+ const VkImageAspectFlagBits aspect,
+ const VkImageLayout layout);
+
+enum isl_aux_usage ATTRIBUTE_PURE
+anv_layout_to_aux_usage(const struct intel_device_info * const devinfo,
+ const struct anv_image *image,
+ const VkImageAspectFlagBits aspect,
+ const VkImageUsageFlagBits usage,
+ const VkImageLayout layout);
+
+enum anv_fast_clear_type ATTRIBUTE_PURE
+anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo,
+ const struct anv_image * const image,
+ const VkImageAspectFlagBits aspect,
+ const VkImageLayout layout);
+
+static inline bool
+anv_image_aspects_compatible(VkImageAspectFlags aspects1,
+ VkImageAspectFlags aspects2)
+{
+ if (aspects1 == aspects2)
+ return true;
+
+ /* Only color aspects with the same number of planes are compatible. */
+ if ((aspects1 & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) != 0 &&
+ (aspects2 & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) != 0 &&
+ util_bitcount(aspects1) == util_bitcount(aspects2))
+ return true;
+
+ return false;
+}
+
+struct anv_image_view {
+ struct vk_image_view vk;
+
+ const struct anv_image *image; /**< VkImageViewCreateInfo::image */
+
+ unsigned n_planes;
+ struct {
+ uint32_t image_plane;
+
+ struct isl_view isl;
+
+ /**
+ * RENDER_SURFACE_STATE when using image as a sampler surface with an
+ * image layout of SHADER_READ_ONLY_OPTIMAL or
+ * DEPTH_STENCIL_READ_ONLY_OPTIMAL.
+ */
+ struct anv_surface_state optimal_sampler_surface_state;
+
+ /**
+ * RENDER_SURFACE_STATE when using image as a sampler surface with an
+ * image layout of GENERAL.
+ */
+ struct anv_surface_state general_sampler_surface_state;
+
+ /**
+ * RENDER_SURFACE_STATE when using image as a storage image. Separate
+ * states for vanilla (with the original format) and one which has been
+ * lowered to a format suitable for reading. This may be a raw surface
+ * in extreme cases or simply a surface with a different format where we
+ * expect some conversion to be done in the shader.
+ */ + struct anv_surface_state storage_surface_state; + struct anv_surface_state lowered_storage_surface_state; + + struct brw_image_param lowered_storage_image_param; + } planes[3]; +}; + +enum anv_image_view_state_flags { + ANV_IMAGE_VIEW_STATE_STORAGE_LOWERED = (1 << 0), + ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL = (1 << 1), +}; + +void anv_image_fill_surface_state(struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + const struct isl_view *view, + isl_surf_usage_flags_t view_usage, + enum isl_aux_usage aux_usage, + const union isl_color_value *clear_color, + enum anv_image_view_state_flags flags, + struct anv_surface_state *state_inout, + struct brw_image_param *image_param_out); + +struct anv_image_create_info { + const VkImageCreateInfo *vk_info; + + /** An opt-in bitmask which filters an ISL-mapping of the Vulkan tiling. */ + isl_tiling_flags_t isl_tiling_flags; + + /** These flags will be added to any derived from VkImageCreateInfo. */ + isl_surf_usage_flags_t isl_extra_usage_flags; +}; + +VkResult anv_image_init(struct anv_device *device, struct anv_image *image, + const struct anv_image_create_info *create_info); + +void anv_image_finish(struct anv_image *image); + +void anv_image_get_memory_requirements(struct anv_device *device, + struct anv_image *image, + VkImageAspectFlags aspects, + VkMemoryRequirements2 *pMemoryRequirements); + +enum isl_format +anv_isl_format_for_descriptor_type(const struct anv_device *device, + VkDescriptorType type); + +static inline uint32_t +anv_rasterization_aa_mode(VkPolygonMode raster_mode, + VkLineRasterizationModeEXT line_mode) +{ + if (raster_mode == VK_POLYGON_MODE_LINE && + line_mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT) + return true; + return false; +} + +VkFormatFeatureFlags2 +anv_get_image_format_features2(const struct intel_device_info *devinfo, + VkFormat vk_format, + const struct anv_format *anv_format, + VkImageTiling vk_tiling, + const struct isl_drm_modifier_info *isl_mod_info); + +void anv_fill_buffer_surface_state(struct anv_device *device, + struct anv_state state, + enum isl_format format, + struct isl_swizzle swizzle, + isl_surf_usage_flags_t usage, + struct anv_address address, + uint32_t range, uint32_t stride); + + +/* Haswell border color is a bit of a disaster. Float and unorm formats use a + * straightforward 32-bit float color in the first 64 bytes. Instead of using + * a nice float/integer union like Gfx8+, Haswell specifies the integer border + * color as a separate entry /after/ the float color. The layout of this entry + * also depends on the format's bpp (with extra hacks for RG32), and overlaps. + * + * Since we don't know the format/bpp, we can't make any of the border colors + * containing '1' work for all formats, as it would be in the wrong place for + * some of them. We opt to make 32-bit integers work as this seems like the + * most common option. Fortunately, transparent black works regardless, as + * all zeroes is the same in every bit-size. 
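The padding in the structs below is what pins the layout: 16 bytes of float color plus 48 bytes of padding put the Haswell integer entry at byte offset 64, and the trailing padding brings the whole entry to 512 bytes (the Gfx8+ variant is padded to 64 bytes). A small, hypothetical sketch of compile-time checks one could place after the definitions, assuming C11 static_assert:

#include <assert.h>   /* static_assert (C11) */
#include <stddef.h>   /* offsetof */

/* Hypothetical layout checks, assuming the border color structs below are
 * in scope. */
static_assert(offsetof(struct hsw_border_color, uint32) == 64,
              "integer border color must start at byte 64");
static_assert(sizeof(struct hsw_border_color) == 512,
              "Haswell border color entry is 512 bytes");
static_assert(sizeof(struct gfx8_border_color) == 64,
              "Gfx8+ border color entry is 64 bytes");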
+ */
+struct hsw_border_color {
+ float float32[4];
+ uint32_t _pad0[12];
+ uint32_t uint32[4];
+ uint32_t _pad1[108];
+};
+
+struct gfx8_border_color {
+ union {
+ float float32[4];
+ uint32_t uint32[4];
+ };
+ /* Pad out to 64 bytes */
+ uint32_t _pad[12];
+};
+
+struct anv_ycbcr_conversion {
+ struct vk_object_base base;
+
+ const struct anv_format * format;
+ VkSamplerYcbcrModelConversion ycbcr_model;
+ VkSamplerYcbcrRange ycbcr_range;
+ VkComponentSwizzle mapping[4];
+ VkChromaLocation chroma_offsets[2];
+ VkFilter chroma_filter;
+ bool chroma_reconstruction;
+};
+
+struct anv_sampler {
+ struct vk_object_base base;
+
+ uint32_t state[3][4];
+ uint32_t n_planes;
+ struct anv_ycbcr_conversion *conversion;
+
+ /* Blob of sampler state data which is guaranteed to be 32-byte aligned
+ * and with a 32-byte stride for use as bindless samplers.
+ */
+ struct anv_state bindless_state;
+
+ struct anv_state custom_border_color;
+};
+
+#define ANV_PIPELINE_STATISTICS_MASK 0x000007ff
+
+struct anv_query_pool {
+ struct vk_object_base base;
+
+ VkQueryType type;
+ VkQueryPipelineStatisticFlags pipeline_statistics;
+ /** Stride between slots, in bytes */
+ uint32_t stride;
+ /** Number of slots in this query pool */
+ uint32_t slots;
+ struct anv_bo * bo;
+
+ /* KHR perf queries: */
+ uint32_t pass_size;
+ uint32_t data_offset;
+ uint32_t snapshot_size;
+ uint32_t n_counters;
+ struct intel_perf_counter_pass *counter_pass;
+ uint32_t n_passes;
+ struct intel_perf_query_info **pass_query;
+};
+
+static inline uint32_t khr_perf_query_preamble_offset(const struct anv_query_pool *pool,
+ uint32_t pass)
+{
+ return pool->pass_size * pass + 8;
+}
+
+struct anv_acceleration_structure {
+ struct vk_object_base base;
+
+ VkDeviceSize size;
+ struct anv_address address;
+};
+
+void
+anv_dump_pipe_bits(enum anv_pipe_bits bits);
+
+static inline void
+anv_add_pending_pipe_bits(struct anv_cmd_buffer* cmd_buffer,
+ enum anv_pipe_bits bits,
+ const char* reason)
+{
+ cmd_buffer->state.pending_pipe_bits |= bits;
+ if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) && bits)
+ {
+ fputs("pc: add ", stderr);
+ anv_dump_pipe_bits(bits);
+ fprintf(stderr, "reason: %s\n", reason);
+ }
+}
+
+struct anv_performance_configuration_intel {
+ struct vk_object_base base;
+
+ struct intel_perf_registers *register_config;
+
+ uint64_t config_id;
+};
+
+void anv_physical_device_init_perf(struct anv_physical_device *device, int fd);
+void anv_device_perf_init(struct anv_device *device);
+void anv_perf_write_pass_results(struct intel_perf_config *perf,
+ struct anv_query_pool *pool, uint32_t pass,
+ const struct intel_perf_query_result *accumulated_results,
+ union VkPerformanceCounterResultKHR *results);
+
+/* Used to emit a series of memcpy operations */
+struct anv_memcpy_state {
+ struct anv_device *device;
+ struct anv_batch *batch;
+
+ struct anv_vb_cache_range vb_bound;
+ struct anv_vb_cache_range vb_dirty;
+};
+
+struct anv_utrace_flush_copy {
+ /* Needs to be the first field */
+ struct intel_ds_flush_data ds;
+
+ /* Batch stuff to implement a copy of timestamps recorded in another
+ * buffer.
+ */ + struct anv_reloc_list relocs; + struct anv_batch batch; + struct anv_bo *batch_bo; + + /* Buffer of 64bits timestamps */ + struct anv_bo *trace_bo; + + /* Syncobj to be signaled when the batch completes */ + struct vk_sync *sync; + + /* Queue on which all the recorded traces are submitted */ + struct anv_queue *queue; + + struct anv_memcpy_state memcpy_state; +}; + +void anv_device_utrace_init(struct anv_device *device); +void anv_device_utrace_finish(struct anv_device *device); +VkResult +anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + struct anv_utrace_flush_copy **out_flush_data); + +#ifdef HAVE_PERFETTO +void anv_perfetto_init(void); +uint64_t anv_perfetto_begin_submit(struct anv_queue *queue); +void anv_perfetto_end_submit(struct anv_queue *queue, uint32_t submission_id, + uint64_t start_ts); +#else +static inline void anv_perfetto_init(void) +{ +} +static inline uint64_t anv_perfetto_begin_submit(struct anv_queue *queue) +{ + return 0; +} +static inline void anv_perfetto_end_submit(struct anv_queue *queue, + uint32_t submission_id, + uint64_t start_ts) +{} +#endif + + +#define ANV_FROM_HANDLE(__anv_type, __name, __handle) \ + VK_FROM_HANDLE(__anv_type, __name, __handle) + +VK_DEFINE_HANDLE_CASTS(anv_cmd_buffer, vk.base, VkCommandBuffer, + VK_OBJECT_TYPE_COMMAND_BUFFER) +VK_DEFINE_HANDLE_CASTS(anv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) +VK_DEFINE_HANDLE_CASTS(anv_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) +VK_DEFINE_HANDLE_CASTS(anv_physical_device, vk.base, VkPhysicalDevice, + VK_OBJECT_TYPE_PHYSICAL_DEVICE) +VK_DEFINE_HANDLE_CASTS(anv_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) + +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_acceleration_structure, base, + VkAccelerationStructureKHR, + VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer, vk.base, VkBuffer, + VK_OBJECT_TYPE_BUFFER) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer_view, base, VkBufferView, + VK_OBJECT_TYPE_BUFFER_VIEW) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_pool, base, VkDescriptorPool, + VK_OBJECT_TYPE_DESCRIPTOR_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_set, base, VkDescriptorSet, + VK_OBJECT_TYPE_DESCRIPTOR_SET) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_set_layout, base, + VkDescriptorSetLayout, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_update_template, base, + VkDescriptorUpdateTemplate, + VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_device_memory, base, VkDeviceMemory, + VK_OBJECT_TYPE_DEVICE_MEMORY) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_image_view, vk.base, VkImageView, + VK_OBJECT_TYPE_IMAGE_VIEW); +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline, base, VkPipeline, + VK_OBJECT_TYPE_PIPELINE) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline_layout, base, VkPipelineLayout, + VK_OBJECT_TYPE_PIPELINE_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_query_pool, base, VkQueryPool, + VK_OBJECT_TYPE_QUERY_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_sampler, base, VkSampler, + VK_OBJECT_TYPE_SAMPLER) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_ycbcr_conversion, base, + VkSamplerYcbcrConversion, + VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_performance_configuration_intel, base, + VkPerformanceConfigurationINTEL, + 
VK_OBJECT_TYPE_PERFORMANCE_CONFIGURATION_INTEL) + +#define anv_genX(devinfo, thing) ({ \ + __typeof(&gfx9_##thing) genX_thing; \ + switch ((devinfo)->verx10) { \ + case 70: \ + genX_thing = &gfx7_##thing; \ + break; \ + case 75: \ + genX_thing = &gfx75_##thing; \ + break; \ + case 80: \ + genX_thing = &gfx8_##thing; \ + break; \ + case 90: \ + genX_thing = &gfx9_##thing; \ + break; \ + case 110: \ + genX_thing = &gfx11_##thing; \ + break; \ + case 120: \ + genX_thing = &gfx12_##thing; \ + break; \ + case 125: \ + genX_thing = &gfx125_##thing; \ + break; \ + default: \ + unreachable("Unknown hardware generation"); \ + } \ + genX_thing; \ +}) + +/* Gen-specific function declarations */ +#ifdef genX +# include "anv_genX.h" +#else +# define genX(x) gfx7_##x +# include "anv_genX.h" +# undef genX +# define genX(x) gfx75_##x +# include "anv_genX.h" +# undef genX +# define genX(x) gfx8_##x +# include "anv_genX.h" +# undef genX +# define genX(x) gfx9_##x +# include "anv_genX.h" +# undef genX +# define genX(x) gfx11_##x +# include "anv_genX.h" +# undef genX +# define genX(x) gfx12_##x +# include "anv_genX.h" +# undef genX +# define genX(x) gfx125_##x +# include "anv_genX.h" +# undef genX +#endif + +#endif /* ANV_PRIVATE_H */ diff --git a/src/intel/vulkan_hasvk/anv_queue.c b/src/intel/vulkan_hasvk/anv_queue.c new file mode 100644 index 00000000000..2cada846753 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_queue.c @@ -0,0 +1,75 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +/** + * This file implements VkQueue + */ + +#include "anv_private.h" + +VkResult +anv_queue_init(struct anv_device *device, struct anv_queue *queue, + uint32_t exec_flags, + const VkDeviceQueueCreateInfo *pCreateInfo, + uint32_t index_in_family) +{ + struct anv_physical_device *pdevice = device->physical; + VkResult result; + + result = vk_queue_init(&queue->vk, &device->vk, pCreateInfo, + index_in_family); + if (result != VK_SUCCESS) + return result; + + if (INTEL_DEBUG(DEBUG_SYNC)) { + result = vk_sync_create(&device->vk, + &device->physical->sync_syncobj_type, + 0, 0, &queue->sync); + if (result != VK_SUCCESS) { + vk_queue_finish(&queue->vk); + return result; + } + } + + queue->vk.driver_submit = anv_queue_submit; + + queue->device = device; + + assert(queue->vk.queue_family_index < pdevice->queue.family_count); + queue->family = &pdevice->queue.families[queue->vk.queue_family_index]; + + queue->index_in_family = index_in_family; + + queue->exec_flags = exec_flags; + + return VK_SUCCESS; +} + +void +anv_queue_finish(struct anv_queue *queue) +{ + if (queue->sync) + vk_sync_destroy(&queue->device->vk, queue->sync); + + vk_queue_finish(&queue->vk); +} diff --git a/src/intel/vulkan_hasvk/anv_util.c b/src/intel/vulkan_hasvk/anv_util.c new file mode 100644 index 00000000000..988010232fe --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_util.c @@ -0,0 +1,92 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "anv_private.h" +#include "vk_enum_to_str.h" + +void +__anv_perf_warn(struct anv_device *device, + const struct vk_object_base *object, + const char *file, int line, const char *format, ...) 
+{ + va_list ap; + char buffer[256]; + + va_start(ap, format); + vsnprintf(buffer, sizeof(buffer), format, ap); + va_end(ap); + + if (object) { + __vk_log(VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT, + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, + VK_LOG_OBJS(object), file, line, + "PERF: %s", buffer); + } else { + __vk_log(VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT, + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, + VK_LOG_NO_OBJS(device->physical->instance), file, line, + "PERF: %s", buffer); + } +} + +void +anv_dump_pipe_bits(enum anv_pipe_bits bits) +{ + if (bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT) + fputs("+depth_flush ", stderr); + if (bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT) + fputs("+dc_flush ", stderr); + if (bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT) + fputs("+hdc_flush ", stderr); + if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) + fputs("+rt_flush ", stderr); + if (bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT) + fputs("+tile_flush ", stderr); + if (bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT) + fputs("+state_inval ", stderr); + if (bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT) + fputs("+const_inval ", stderr); + if (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT) + fputs("+vf_inval ", stderr); + if (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT) + fputs("+tex_inval ", stderr); + if (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT) + fputs("+ic_inval ", stderr); + if (bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT) + fputs("+pb_stall ", stderr); + if (bits & ANV_PIPE_PSS_STALL_SYNC_BIT) + fputs("+pss_stall ", stderr); + if (bits & ANV_PIPE_DEPTH_STALL_BIT) + fputs("+depth_stall ", stderr); + if (bits & ANV_PIPE_CS_STALL_BIT) + fputs("+cs_stall ", stderr); + if (bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT) + fputs("+utdp_flush", stderr); +} diff --git a/src/intel/vulkan_hasvk/anv_utrace.c b/src/intel/vulkan_hasvk/anv_utrace.c new file mode 100644 index 00000000000..965be744411 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_utrace.c @@ -0,0 +1,346 @@ +/* + * Copyright © 2021 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "anv_private.h" + +#include "perf/intel_perf.h" + +static uint32_t +command_buffers_count_utraces(struct anv_device *device, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + uint32_t *utrace_copies) +{ + if (!u_trace_context_actively_tracing(&device->ds.trace_context)) + return 0; + + uint32_t utraces = 0; + for (uint32_t i = 0; i < cmd_buffer_count; i++) { + if (u_trace_has_points(&cmd_buffers[i]->trace)) { + utraces++; + if (!(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) + *utrace_copies += list_length(&cmd_buffers[i]->trace.trace_chunks); + } + } + + return utraces; +} + +static void +anv_utrace_delete_flush_data(struct u_trace_context *utctx, + void *flush_data) +{ + struct anv_device *device = + container_of(utctx, struct anv_device, ds.trace_context); + struct anv_utrace_flush_copy *flush = flush_data; + + intel_ds_flush_data_fini(&flush->ds); + + if (flush->trace_bo) { + assert(flush->batch_bo); + anv_reloc_list_finish(&flush->relocs, &device->vk.alloc); + anv_device_release_bo(device, flush->batch_bo); + anv_device_release_bo(device, flush->trace_bo); + } + + vk_sync_destroy(&device->vk, flush->sync); + + vk_free(&device->vk.alloc, flush); +} + +static void +anv_device_utrace_emit_copy_ts_buffer(struct u_trace_context *utctx, + void *cmdstream, + void *ts_from, uint32_t from_offset, + void *ts_to, uint32_t to_offset, + uint32_t count) +{ + struct anv_device *device = + container_of(utctx, struct anv_device, ds.trace_context); + struct anv_utrace_flush_copy *flush = cmdstream; + struct anv_address from_addr = (struct anv_address) { + .bo = ts_from, .offset = from_offset * sizeof(uint64_t) }; + struct anv_address to_addr = (struct anv_address) { + .bo = ts_to, .offset = to_offset * sizeof(uint64_t) }; + + anv_genX(device->info, emit_so_memcpy)(&flush->memcpy_state, + to_addr, from_addr, count * sizeof(uint64_t)); +} + +VkResult +anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + struct anv_utrace_flush_copy **out_flush_data) +{ + struct anv_device *device = queue->device; + uint32_t utrace_copies = 0; + uint32_t utraces = command_buffers_count_utraces(device, + cmd_buffer_count, + cmd_buffers, + &utrace_copies); + if (!utraces) { + *out_flush_data = NULL; + return VK_SUCCESS; + } + + VkResult result; + struct anv_utrace_flush_copy *flush = + vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_flush_copy), + 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!flush) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + intel_ds_flush_data_init(&flush->ds, queue->ds, queue->ds->submission_id); + + result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type, + 0, 0, &flush->sync); + if (result != VK_SUCCESS) + goto error_sync; + + if (utrace_copies > 0) { + result = anv_bo_pool_alloc(&device->utrace_bo_pool, + utrace_copies * 4096, + &flush->trace_bo); + if (result != VK_SUCCESS) + goto error_trace_buf; + + result = anv_bo_pool_alloc(&device->utrace_bo_pool, + /* 128 dwords of setup + 64 dwords per copy */ + align_u32(512 + 64 * utrace_copies, 4096), + &flush->batch_bo); + if (result != VK_SUCCESS) + goto error_batch_buf; + + result = anv_reloc_list_init(&flush->relocs, &device->vk.alloc); + if (result != VK_SUCCESS) + goto error_reloc_list; + + flush->batch.alloc = &device->vk.alloc; + flush->batch.relocs = &flush->relocs; + anv_batch_set_storage(&flush->batch, + (struct anv_address) { .bo = flush->batch_bo, }, + 
flush->batch_bo->map, flush->batch_bo->size); + + /* Emit the copies */ + anv_genX(device->info, emit_so_memcpy_init)(&flush->memcpy_state, + device, + &flush->batch); + for (uint32_t i = 0; i < cmd_buffer_count; i++) { + if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) { + u_trace_flush(&cmd_buffers[i]->trace, flush, false); + } else { + u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace), + u_trace_end_iterator(&cmd_buffers[i]->trace), + &flush->ds.trace, + flush, + anv_device_utrace_emit_copy_ts_buffer); + } + } + anv_genX(device->info, emit_so_memcpy_fini)(&flush->memcpy_state); + + u_trace_flush(&flush->ds.trace, flush, true); + + if (flush->batch.status != VK_SUCCESS) { + result = flush->batch.status; + goto error_batch; + } + } else { + for (uint32_t i = 0; i < cmd_buffer_count; i++) { + assert(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT); + u_trace_flush(&cmd_buffers[i]->trace, flush, i == (cmd_buffer_count - 1)); + } + } + + flush->queue = queue; + + *out_flush_data = flush; + + return VK_SUCCESS; + + error_batch: + anv_reloc_list_finish(&flush->relocs, &device->vk.alloc); + error_reloc_list: + anv_bo_pool_free(&device->utrace_bo_pool, flush->batch_bo); + error_batch_buf: + anv_bo_pool_free(&device->utrace_bo_pool, flush->trace_bo); + error_trace_buf: + vk_sync_destroy(&device->vk, flush->sync); + error_sync: + vk_free(&device->vk.alloc, flush); + return result; +} + +static void * +anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b) +{ + struct anv_device *device = + container_of(utctx, struct anv_device, ds.trace_context); + + struct anv_bo *bo = NULL; + UNUSED VkResult result = + anv_bo_pool_alloc(&device->utrace_bo_pool, + align_u32(size_b, 4096), + &bo); + assert(result == VK_SUCCESS); + + return bo; +} + +static void +anv_utrace_destroy_ts_buffer(struct u_trace_context *utctx, void *timestamps) +{ + struct anv_device *device = + container_of(utctx, struct anv_device, ds.trace_context); + struct anv_bo *bo = timestamps; + + anv_bo_pool_free(&device->utrace_bo_pool, bo); +} + +static void +anv_utrace_record_ts(struct u_trace *ut, void *cs, + void *timestamps, unsigned idx, + bool end_of_pipe) +{ + struct anv_cmd_buffer *cmd_buffer = + container_of(ut, struct anv_cmd_buffer, trace); + struct anv_device *device = cmd_buffer->device; + struct anv_bo *bo = timestamps; + + device->physical->cmd_emit_timestamp(&cmd_buffer->batch, device, + (struct anv_address) { + .bo = bo, + .offset = idx * sizeof(uint64_t) }, + end_of_pipe); +} + +static uint64_t +anv_utrace_read_ts(struct u_trace_context *utctx, + void *timestamps, unsigned idx, void *flush_data) +{ + struct anv_device *device = + container_of(utctx, struct anv_device, ds.trace_context); + struct anv_bo *bo = timestamps; + struct anv_utrace_flush_copy *flush = flush_data; + + /* Only need to stall on results for the first entry: */ + if (idx == 0) { + UNUSED VkResult result = + vk_sync_wait(&device->vk, + flush->sync, + 0, + VK_SYNC_WAIT_COMPLETE, + os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE)); + assert(result == VK_SUCCESS); + } + + uint64_t *ts = bo->map; + + /* Don't translate the no-timestamp marker: */ + if (ts[idx] == U_TRACE_NO_TIMESTAMP) + return U_TRACE_NO_TIMESTAMP; + + return intel_device_info_timebase_scale(device->info, ts[idx]); +} + +static const char * +queue_family_to_name(const struct anv_queue_family *family) +{ + switch (family->engine_class) { + case I915_ENGINE_CLASS_RENDER: + return "render"; + case 
I915_ENGINE_CLASS_COPY: + return "copy"; + case I915_ENGINE_CLASS_VIDEO: + return "video"; + case I915_ENGINE_CLASS_VIDEO_ENHANCE: + return "video-enh"; + default: + return "unknown"; + } +} + +void +anv_device_utrace_init(struct anv_device *device) +{ + anv_bo_pool_init(&device->utrace_bo_pool, device, "utrace"); + intel_ds_device_init(&device->ds, device->info, device->fd, + device->physical->local_minor - 128, + INTEL_DS_API_VULKAN); + u_trace_context_init(&device->ds.trace_context, + &device->ds, + anv_utrace_create_ts_buffer, + anv_utrace_destroy_ts_buffer, + anv_utrace_record_ts, + anv_utrace_read_ts, + anv_utrace_delete_flush_data); + + for (uint32_t q = 0; q < device->queue_count; q++) { + struct anv_queue *queue = &device->queues[q]; + + queue->ds = + intel_ds_device_add_queue(&device->ds, "%s%u", + queue_family_to_name(queue->family), + queue->index_in_family); + } +} + +void +anv_device_utrace_finish(struct anv_device *device) +{ + u_trace_context_process(&device->ds.trace_context, true); + intel_ds_device_fini(&device->ds); + anv_bo_pool_finish(&device->utrace_bo_pool); +} + +enum intel_ds_stall_flag +anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits) +{ + static const struct { + enum anv_pipe_bits anv; + enum intel_ds_stall_flag ds; + } anv_to_ds_flags[] = { + { .anv = ANV_PIPE_DEPTH_CACHE_FLUSH_BIT, .ds = INTEL_DS_DEPTH_CACHE_FLUSH_BIT, }, + { .anv = ANV_PIPE_DATA_CACHE_FLUSH_BIT, .ds = INTEL_DS_DATA_CACHE_FLUSH_BIT, }, + { .anv = ANV_PIPE_TILE_CACHE_FLUSH_BIT, .ds = INTEL_DS_TILE_CACHE_FLUSH_BIT, }, + { .anv = ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, .ds = INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT, }, + { .anv = ANV_PIPE_STATE_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_STATE_CACHE_INVALIDATE_BIT, }, + { .anv = ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_CONST_CACHE_INVALIDATE_BIT, }, + { .anv = ANV_PIPE_VF_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_VF_CACHE_INVALIDATE_BIT, }, + { .anv = ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT, }, + { .anv = ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_INST_CACHE_INVALIDATE_BIT, }, + { .anv = ANV_PIPE_DEPTH_STALL_BIT, .ds = INTEL_DS_DEPTH_STALL_BIT, }, + { .anv = ANV_PIPE_CS_STALL_BIT, .ds = INTEL_DS_CS_STALL_BIT, }, + { .anv = ANV_PIPE_HDC_PIPELINE_FLUSH_BIT, .ds = INTEL_DS_HDC_PIPELINE_FLUSH_BIT, }, + { .anv = ANV_PIPE_STALL_AT_SCOREBOARD_BIT, .ds = INTEL_DS_STALL_AT_SCOREBOARD_BIT, }, + { .anv = ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, .ds = INTEL_DS_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, }, + }; + + enum intel_ds_stall_flag ret = 0; + for (uint32_t i = 0; i < ARRAY_SIZE(anv_to_ds_flags); i++) { + if (anv_to_ds_flags[i].anv & bits) + ret |= anv_to_ds_flags[i].ds; + } + + return ret; +} diff --git a/src/intel/vulkan_hasvk/anv_wsi.c b/src/intel/vulkan_hasvk/anv_wsi.c new file mode 100644 index 00000000000..5e98673e275 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_wsi.c @@ -0,0 +1,118 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall 
be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" +#include "anv_measure.h" +#include "wsi_common.h" +#include "vk_fence.h" +#include "vk_queue.h" +#include "vk_semaphore.h" +#include "vk_util.h" + +static PFN_vkVoidFunction +anv_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + return vk_instance_get_proc_addr_unchecked(&pdevice->instance->vk, pName); +} + +VkResult +anv_init_wsi(struct anv_physical_device *physical_device) +{ + VkResult result; + + result = wsi_device_init(&physical_device->wsi_device, + anv_physical_device_to_handle(physical_device), + anv_wsi_proc_addr, + &physical_device->instance->vk.alloc, + physical_device->master_fd, + &physical_device->instance->dri_options, + false); + if (result != VK_SUCCESS) + return result; + + physical_device->wsi_device.supports_modifiers = true; + physical_device->wsi_device.signal_semaphore_with_memory = true; + physical_device->wsi_device.signal_fence_with_memory = true; + + physical_device->vk.wsi_device = &physical_device->wsi_device; + + wsi_device_setup_syncobj_fd(&physical_device->wsi_device, + physical_device->local_fd); + + return VK_SUCCESS; +} + +void +anv_finish_wsi(struct anv_physical_device *physical_device) +{ + physical_device->vk.wsi_device = NULL; + wsi_device_finish(&physical_device->wsi_device, + &physical_device->instance->vk.alloc); +} + +VkResult anv_AcquireNextImage2KHR( + VkDevice _device, + const VkAcquireNextImageInfoKHR *pAcquireInfo, + uint32_t *pImageIndex) +{ + VK_FROM_HANDLE(anv_device, device, _device); + + VkResult result = + wsi_common_acquire_next_image2(&device->physical->wsi_device, + _device, pAcquireInfo, pImageIndex); + if (result == VK_SUCCESS) + anv_measure_acquire(device); + + return result; +} + +VkResult anv_QueuePresentKHR( + VkQueue _queue, + const VkPresentInfoKHR* pPresentInfo) +{ + ANV_FROM_HANDLE(anv_queue, queue, _queue); + struct anv_device *device = queue->device; + VkResult result; + + if (device->debug_frame_desc) { + device->debug_frame_desc->frame_id++; + if (device->physical->memory.need_clflush) { + intel_clflush_range(device->debug_frame_desc, + sizeof(*device->debug_frame_desc)); + } + } + + result = vk_queue_wait_before_present(&queue->vk, pPresentInfo); + if (result != VK_SUCCESS) + return result; + + result = wsi_common_queue_present(&device->physical->wsi_device, + anv_device_to_handle(queue->device), + _queue, 0, + pPresentInfo); + + u_trace_context_process(&device->ds.trace_context, true); + + return result; +} diff --git a/src/intel/vulkan_hasvk/genX_blorp_exec.c b/src/intel/vulkan_hasvk/genX_blorp_exec.c new file mode 100644 index 00000000000..40582ab9391 --- /dev/null +++ b/src/intel/vulkan_hasvk/genX_blorp_exec.c @@ -0,0 +1,410 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in 
the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "anv_private.h" +#include "anv_measure.h" + +/* These are defined in anv_private.h and blorp_genX_exec.h */ +#undef __gen_address_type +#undef __gen_user_data +#undef __gen_combine_address + +#include "common/intel_l3_config.h" +#include "blorp/blorp_genX_exec.h" + +#include "ds/intel_tracepoints.h" + +static void blorp_measure_start(struct blorp_batch *_batch, + const struct blorp_params *params) +{ + struct anv_cmd_buffer *cmd_buffer = _batch->driver_batch; + trace_intel_begin_blorp(&cmd_buffer->trace); + anv_measure_snapshot(cmd_buffer, + params->snapshot_type, + NULL, 0); +} + +static void blorp_measure_end(struct blorp_batch *_batch, + const struct blorp_params *params) +{ + struct anv_cmd_buffer *cmd_buffer = _batch->driver_batch; + trace_intel_end_blorp(&cmd_buffer->trace, + params->x1 - params->x0, + params->y1 - params->y0, + params->hiz_op, + params->fast_clear_op, + params->shader_type, + params->shader_pipeline); +} + +static void * +blorp_emit_dwords(struct blorp_batch *batch, unsigned n) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + return anv_batch_emit_dwords(&cmd_buffer->batch, n); +} + +static uint64_t +blorp_emit_reloc(struct blorp_batch *batch, + void *location, struct blorp_address address, uint32_t delta) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + assert(cmd_buffer->batch.start <= location && + location < cmd_buffer->batch.end); + return anv_batch_emit_reloc(&cmd_buffer->batch, location, + address.buffer, address.offset + delta); +} + +static void +blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset, + struct blorp_address address, uint32_t delta) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + VkResult result; + + if (ANV_ALWAYS_SOFTPIN) { + result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc, + address.buffer); + if (unlikely(result != VK_SUCCESS)) + anv_batch_set_error(&cmd_buffer->batch, result); + return; + } + + uint64_t address_u64 = 0; + result = anv_reloc_list_add(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc, + ss_offset, address.buffer, + address.offset + delta, + &address_u64); + if (result != VK_SUCCESS) + anv_batch_set_error(&cmd_buffer->batch, result); + + void *dest = anv_block_pool_map( + &cmd_buffer->device->surface_state_pool.block_pool, ss_offset, 8); + write_reloc(cmd_buffer->device, dest, address_u64, false); +} + +static uint64_t +blorp_get_surface_address(struct blorp_batch *blorp_batch, + struct blorp_address address) +{ + if (ANV_ALWAYS_SOFTPIN) { + struct 
anv_address anv_addr = { + .bo = address.buffer, + .offset = address.offset, + }; + return anv_address_physical(anv_addr); + } else { + /* We'll let blorp_surface_reloc write the address. */ + return 0; + } +} + +#if GFX_VER >= 7 && GFX_VER < 10 +static struct blorp_address +blorp_get_surface_base_address(struct blorp_batch *batch) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + return (struct blorp_address) { + .buffer = cmd_buffer->device->surface_state_pool.block_pool.bo, + .offset = 0, + }; +} +#endif + +static void * +blorp_alloc_dynamic_state(struct blorp_batch *batch, + uint32_t size, + uint32_t alignment, + uint32_t *offset) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, alignment); + + *offset = state.offset; + return state.map; +} + +UNUSED static void * +blorp_alloc_general_state(struct blorp_batch *batch, + uint32_t size, + uint32_t alignment, + uint32_t *offset) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + struct anv_state state = + anv_state_stream_alloc(&cmd_buffer->general_state_stream, size, + alignment); + + *offset = state.offset; + return state.map; +} + +static void +blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries, + unsigned state_size, unsigned state_alignment, + uint32_t *bt_offset, + uint32_t *surface_offsets, void **surface_maps) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + uint32_t state_offset; + struct anv_state bt_state; + + VkResult result = + anv_cmd_buffer_alloc_blorp_binding_table(cmd_buffer, num_entries, + &state_offset, &bt_state); + if (result != VK_SUCCESS) + return; + + uint32_t *bt_map = bt_state.map; + *bt_offset = bt_state.offset; + + for (unsigned i = 0; i < num_entries; i++) { + struct anv_state surface_state = + anv_cmd_buffer_alloc_surface_state(cmd_buffer); + bt_map[i] = surface_state.offset + state_offset; + surface_offsets[i] = surface_state.offset; + surface_maps[i] = surface_state.map; + } +} + +static uint32_t +blorp_binding_table_offset_to_pointer(struct blorp_batch *batch, + uint32_t offset) +{ + return offset; +} + +static void * +blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size, + struct blorp_address *addr) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + struct anv_state vb_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 64); + + *addr = (struct blorp_address) { + .buffer = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = vb_state.offset, + .mocs = isl_mocs(&cmd_buffer->device->isl_dev, + ISL_SURF_USAGE_VERTEX_BUFFER_BIT, false), + }; + + return vb_state.map; +} + +static void +blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch, + const struct blorp_address *addrs, + uint32_t *sizes, + unsigned num_vbs) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + for (unsigned i = 0; i < num_vbs; i++) { + struct anv_address anv_addr = { + .bo = addrs[i].buffer, + .offset = addrs[i].offset, + }; + genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, + i, anv_addr, sizes[i]); + } + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* Technically, we should call this *after* 3DPRIMITIVE but it doesn't + * really matter for blorp because we never call apply_pipe_flushes after + * this point. 
+ */ + genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, SEQUENTIAL, + (1 << num_vbs) - 1); +} + +UNUSED static struct blorp_address +blorp_get_workaround_address(struct blorp_batch *batch) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + return (struct blorp_address) { + .buffer = cmd_buffer->device->workaround_address.bo, + .offset = cmd_buffer->device->workaround_address.offset, + }; +} + +static void +blorp_flush_range(struct blorp_batch *batch, void *start, size_t size) +{ + /* We don't need to flush states anymore, since everything will be snooped. + */ +} + +static const struct intel_l3_config * +blorp_get_l3_config(struct blorp_batch *batch) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + return cmd_buffer->state.current_l3_config; +} + +static void +blorp_exec_on_render(struct blorp_batch *batch, + const struct blorp_params *params) +{ + assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT); + + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1; + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0, + params->y1 - params->y0, scale); + +#if GFX_VER >= 11 + /* The PIPE_CONTROL command description says: + * + * "Whenever a Binding Table Index (BTI) used by a Render Target Message + * points to a different RENDER_SURFACE_STATE, SW must issue a Render + * Target Cache Flush by enabling this bit. When render target flush + * is set due to new association of BTI, PS Scoreboard Stall bit must + * be set in this packet." + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "before blorp BTI change"); +#endif + + if (params->depth.enabled && + !(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL)) + genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, ¶ms->depth.surf); + + genX(flush_pipeline_select_3d)(cmd_buffer); + + /* Apply any outstanding flushes in case pipeline select haven't. */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer); + + /* BLORP doesn't do anything fancy with depth such as discards, so we want + * the PMA fix off. Also, off is always the safe option. + */ + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false); + + blorp_exec(batch, params); + +#if GFX_VER >= 11 + /* The PIPE_CONTROL command description says: + * + * "Whenever a Binding Table Index (BTI) used by a Render Target Message + * points to a different RENDER_SURFACE_STATE, SW must issue a Render + * Target Cache Flush by enabling this bit. When render target flush + * is set due to new association of BTI, PS Scoreboard Stall bit must + * be set in this packet." + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "after blorp BTI change"); +#endif + + /* Calculate state that does not get touched by blorp. + * Flush everything else. 
+ */ + anv_cmd_dirty_mask_t dirty = ~(ANV_CMD_DIRTY_INDEX_BUFFER | + ANV_CMD_DIRTY_XFB_ENABLE); + + BITSET_DECLARE(dyn_dirty, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX); + BITSET_ONES(dyn_dirty); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_VP_SCISSORS); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_FSR); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS); + if (!params->wm_prog_data) { + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP); + } + + cmd_buffer->state.gfx.vb_dirty = ~0; + cmd_buffer->state.gfx.dirty |= dirty; + BITSET_OR(cmd_buffer->vk.dynamic_graphics_state.dirty, + cmd_buffer->vk.dynamic_graphics_state.dirty, dyn_dirty); + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; +} + +static void +blorp_exec_on_compute(struct blorp_batch *batch, + const struct blorp_params *params) +{ + assert(batch->flags & BLORP_BATCH_USE_COMPUTE); + + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_COMPUTE_BIT); + + genX(flush_pipeline_select_gpgpu)(cmd_buffer); + + /* Apply any outstanding flushes in case pipeline select haven't. */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + blorp_exec(batch, params); + + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; +} + +void +genX(blorp_exec)(struct blorp_batch *batch, + const struct blorp_params *params) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + if (!cmd_buffer->state.current_l3_config) { + const struct intel_l3_config *cfg = + intel_get_default_l3_config(cmd_buffer->device->info); + genX(cmd_buffer_config_l3)(cmd_buffer, cfg); + } + +#if GFX_VER == 7 + /* The MI_LOAD/STORE_REGISTER_MEM commands which BLORP uses to implement + * indirect fast-clear colors can cause GPU hangs if we don't stall first. + * See genX(cmd_buffer_mi_memcpy) for more details. + */ + if (params->src.clear_color_addr.buffer || + params->dst.clear_color_addr.buffer) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "before blorp prep fast clear"); + } +#endif + + if (batch->flags & BLORP_BATCH_USE_COMPUTE) + blorp_exec_on_compute(batch, params); + else + blorp_exec_on_render(batch, params); +} diff --git a/src/intel/vulkan_hasvk/genX_cmd_buffer.c b/src/intel/vulkan_hasvk/genX_cmd_buffer.c new file mode 100644 index 00000000000..8c236c2aeba --- /dev/null +++ b/src/intel/vulkan_hasvk/genX_cmd_buffer.c @@ -0,0 +1,7488 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include + +#include "anv_private.h" +#include "anv_measure.h" +#include "vk_format.h" +#include "vk_render_pass.h" +#include "vk_util.h" +#include "util/fast_idiv_by_const.h" + +#include "common/intel_aux_map.h" +#include "common/intel_l3_config.h" +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" +#include "genxml/gen_rt_pack.h" +#include "common/intel_guardband.h" +#include "compiler/brw_prim.h" + +#include "nir/nir_xfb_info.h" + +#include "ds/intel_tracepoints.h" + +/* We reserve : + * - GPR 14 for secondary command buffer returns + * - GPR 15 for conditional rendering + */ +#define MI_BUILDER_NUM_ALLOC_GPRS 14 +#define __gen_get_batch_dwords anv_batch_emit_dwords +#define __gen_address_offset anv_address_add +#define __gen_get_batch_address(b, a) anv_batch_address(b, a) +#include "common/mi_builder.h" + +static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, + uint32_t pipeline); + +static enum anv_pipe_bits +convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) { + enum anv_pipe_bits bits = 0; + bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0; + bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0; +#if GFX_VERx10 >= 125 + bits |= (pc->PSSStallSyncEnable) ? ANV_PIPE_PSS_STALL_SYNC_BIT : 0; +#endif +#if GFX_VER >= 12 + bits |= (pc->TileCacheFlushEnable) ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0; + bits |= (pc->HDCPipelineFlushEnable) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0; +#endif + bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0; + bits |= (pc->VFCacheInvalidationEnable) ? ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->InstructionCacheInvalidateEnable) ? ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0; + bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0; + bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0; +#if GFX_VERx10 == 125 + bits |= (pc->UntypedDataPortCacheFlushEnable) ? 
ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0; +#endif + return bits; +} + +#define anv_debug_dump_pc(pc) \ + if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \ + fputs("pc: emit PC=( ", stderr); \ + anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \ + fprintf(stderr, ") reason: %s\n", __FUNCTION__); \ + } + +static bool +is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_queue_family *queue_family = cmd_buffer->queue_family; + return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0; +} + +void +genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_device *device = cmd_buffer->device; + uint32_t mocs = isl_mocs(&device->isl_dev, 0, false); + + /* If we are emitting a new state base address we probably need to re-emit + * binding tables. + */ + cmd_buffer->state.descriptors_dirty |= ~0; + +#if GFX_VERx10 >= 125 + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + anv_debug_dump_pc(pc); + } + anv_batch_emit( + &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) { + btpa.BindingTablePoolBaseAddress = + anv_cmd_buffer_surface_base_address(cmd_buffer); + btpa.BindingTablePoolBufferSize = BINDING_TABLE_POOL_BLOCK_SIZE / 4096; + btpa.MOCS = mocs; + } +#else /* GFX_VERx10 < 125 */ + /* Emit a render target cache flush. + * + * This isn't documented anywhere in the PRM. However, it seems to be + * necessary prior to changing the surface state base address. Without + * this, we get GPU hangs when using multi-level command buffers which + * clear depth, reset state base address, and then go render stuff. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { +#if GFX_VER >= 12 + pc.HDCPipelineFlushEnable = true; +#else + pc.DCFlushEnable = true; +#endif + pc.RenderTargetCacheFlushEnable = true; + pc.CommandStreamerStallEnable = true; + anv_debug_dump_pc(pc); + } + +#if GFX_VERx10 == 120 + /* Wa_1607854226: + * + * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline + * mode by putting the pipeline temporarily in 3D mode. + */ + uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline; + genX(flush_pipeline_select_3d)(cmd_buffer); +#endif + + anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) { + sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 }; + sba.GeneralStateMOCS = mocs; + sba.GeneralStateBaseAddressModifyEnable = true; + + sba.StatelessDataPortAccessMOCS = mocs; + + sba.SurfaceStateBaseAddress = + anv_cmd_buffer_surface_base_address(cmd_buffer); + sba.SurfaceStateMOCS = mocs; + sba.SurfaceStateBaseAddressModifyEnable = true; + + sba.DynamicStateBaseAddress = + (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 }; + sba.DynamicStateMOCS = mocs; + sba.DynamicStateBaseAddressModifyEnable = true; + + sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 }; + sba.IndirectObjectMOCS = mocs; + sba.IndirectObjectBaseAddressModifyEnable = true; + + sba.InstructionBaseAddress = + (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 }; + sba.InstructionMOCS = mocs; + sba.InstructionBaseAddressModifyEnable = true; + +# if (GFX_VER >= 8) + /* Broadwell requires that we specify a buffer size for a bunch of + * these fields. However, since we will be growing the BO's live, we + * just set them all to the maximum. 
+ */ + sba.GeneralStateBufferSize = 0xfffff; + sba.IndirectObjectBufferSize = 0xfffff; + if (anv_use_relocations(device->physical)) { + sba.DynamicStateBufferSize = 0xfffff; + sba.InstructionBufferSize = 0xfffff; + } else { + /* With softpin, we use fixed addresses so we actually know how big + * our base addresses are. + */ + sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096; + sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096; + } + sba.GeneralStateBufferSizeModifyEnable = true; + sba.IndirectObjectBufferSizeModifyEnable = true; + sba.DynamicStateBufferSizeModifyEnable = true; + sba.InstructionBuffersizeModifyEnable = true; +# else + /* On gfx7, we have upper bounds instead. According to the docs, + * setting an upper bound of zero means that no bounds checking is + * performed so, in theory, we should be able to leave them zero. + * However, border color is broken and the GPU bounds-checks anyway. + * To avoid this and other potential problems, we may as well set it + * for everything. + */ + sba.GeneralStateAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.GeneralStateAccessUpperBoundModifyEnable = true; + sba.DynamicStateAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.DynamicStateAccessUpperBoundModifyEnable = true; + sba.InstructionAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.InstructionAccessUpperBoundModifyEnable = true; +# endif +# if (GFX_VER >= 9) + sba.BindlessSurfaceStateBaseAddress = + (struct anv_address) { device->surface_state_pool.block_pool.bo, 0 }; + sba.BindlessSurfaceStateSize = (1 << 20) - 1; + sba.BindlessSurfaceStateMOCS = mocs; + sba.BindlessSurfaceStateBaseAddressModifyEnable = true; +# endif +# if (GFX_VER >= 10) + sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 }; + sba.BindlessSamplerStateMOCS = mocs; + sba.BindlessSamplerStateBaseAddressModifyEnable = true; + sba.BindlessSamplerStateBufferSize = 0; +# endif +#if GFX_VERx10 >= 125 + sba.L1CacheControl = L1CC_WB; +#endif + } + +#if GFX_VERx10 == 120 + /* Wa_1607854226: + * + * Put the pipeline back into its current mode. + */ + if (gfx12_wa_pipeline != UINT32_MAX) + genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline); +#endif + +#endif /* GFX_VERx10 < 125 */ + + /* After re-setting the surface state base address, we have to do some + * cache flushing so that the sampler engine will pick up the new + * SURFACE_STATE objects and binding tables. From the Broadwell PRM, + * Shared Function > 3D Sampler > State > State Caching (page 96): + * + * Coherency with system memory in the state cache, like the texture + * cache is handled partially by software. It is expected that the + * command stream or shader will issue Cache Flush operation or + * Cache_Flush sampler message to ensure that the L1 cache remains + * coherent with system memory. + * + * [...] + * + * Whenever the value of the Dynamic_State_Base_Addr, + * Surface_State_Base_Addr are altered, the L1 state cache must be + * invalidated to ensure the new surface or sampler state is fetched + * from system memory. + * + * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit + * which, according the PIPE_CONTROL instruction documentation in the + * Broadwell PRM: + * + * Setting this bit is independent of any other bit in this packet. + * This bit controls the invalidation of the L1 and L2 state caches + * at the top of the pipe i.e. at the parsing time. 
+ * + * Unfortunately, experimentation seems to indicate that state cache + * invalidation through a PIPE_CONTROL does nothing whatsoever in + * regards to surface state and binding tables. In stead, it seems that + * invalidating the texture cache is what is actually needed. + * + * XXX: As far as we have been able to determine through + * experimentation, shows that flush the texture cache appears to be + * sufficient. The theory here is that all of the sampling/rendering + * units cache the binding table in the texture cache. However, we have + * yet to be able to actually confirm this. + * + * Wa_14013910100: + * + * "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice + * or program pipe control with Instruction cache invalidate post + * STATE_BASE_ADDRESS command" + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.TextureCacheInvalidationEnable = true; + pc.ConstantCacheInvalidationEnable = true; + pc.StateCacheInvalidationEnable = true; +#if GFX_VERx10 == 125 + pc.InstructionCacheInvalidateEnable = true; +#endif + anv_debug_dump_pc(pc); + } +} + +static void +add_surface_reloc(struct anv_cmd_buffer *cmd_buffer, + struct anv_state state, struct anv_address addr) +{ + VkResult result; + + if (anv_use_relocations(cmd_buffer->device->physical)) { + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + result = anv_reloc_list_add(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc, + state.offset + isl_dev->ss.addr_offset, + addr.bo, addr.offset, NULL); + } else { + result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc, + addr.bo); + } + + if (unlikely(result != VK_SUCCESS)) + anv_batch_set_error(&cmd_buffer->batch, result); +} + +static void +add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer, + struct anv_surface_state state) +{ + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + + assert(!anv_address_is_null(state.address)); + add_surface_reloc(cmd_buffer, state.state, state.address); + + if (!anv_address_is_null(state.aux_address)) { + VkResult result = + anv_reloc_list_add(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc, + state.state.offset + isl_dev->ss.aux_addr_offset, + state.aux_address.bo, + state.aux_address.offset, + NULL); + if (result != VK_SUCCESS) + anv_batch_set_error(&cmd_buffer->batch, result); + } + + if (!anv_address_is_null(state.clear_address)) { + VkResult result = + anv_reloc_list_add(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc, + state.state.offset + + isl_dev->ss.clear_color_state_offset, + state.clear_address.bo, + state.clear_address.offset, + NULL); + if (result != VK_SUCCESS) + anv_batch_set_error(&cmd_buffer->batch, result); + } +} + +static bool +isl_color_value_requires_conversion(union isl_color_value color, + const struct isl_surf *surf, + const struct isl_view *view) +{ + if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle)) + return false; + + uint32_t surf_pack[4] = { 0, 0, 0, 0 }; + isl_color_value_pack(&color, surf->format, surf_pack); + + uint32_t view_pack[4] = { 0, 0, 0, 0 }; + union isl_color_value swiz_color = + isl_color_value_swizzle_inv(color, view->swizzle); + isl_color_value_pack(&swiz_color, view->format, view_pack); + + return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0; +} + +static bool +anv_can_fast_clear_color_view(struct anv_device * device, + struct anv_image_view *iview, + VkImageLayout layout, + union isl_color_value clear_color, + uint32_t 
num_layers, + VkRect2D render_area) +{ + if (iview->planes[0].isl.base_array_layer >= + anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT, + iview->planes[0].isl.base_level)) + return false; + + /* Start by getting the fast clear type. We use the first subpass + * layout here because we don't want to fast-clear if the first subpass + * to use the attachment can't handle fast-clears. + */ + enum anv_fast_clear_type fast_clear_type = + anv_layout_to_fast_clear_type(device->info, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + layout); + switch (fast_clear_type) { + case ANV_FAST_CLEAR_NONE: + return false; + case ANV_FAST_CLEAR_DEFAULT_VALUE: + if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format)) + return false; + break; + case ANV_FAST_CLEAR_ANY: + break; + } + + /* Potentially, we could do partial fast-clears but doing so has crazy + * alignment restrictions. It's easier to just restrict to full size + * fast clears for now. + */ + if (render_area.offset.x != 0 || + render_area.offset.y != 0 || + render_area.extent.width != iview->vk.extent.width || + render_area.extent.height != iview->vk.extent.height) + return false; + + /* On Broadwell and earlier, we can only handle 0/1 clear colors */ + if (GFX_VER <= 8 && + !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format)) + return false; + + /* If the clear color is one that would require non-trivial format + * conversion on resolve, we don't bother with the fast clear. This + * shouldn't be common as most clear colors are 0/1 and the most common + * format re-interpretation is for sRGB. + */ + if (isl_color_value_requires_conversion(clear_color, + &iview->image->planes[0].primary_surface.isl, + &iview->planes[0].isl)) { + anv_perf_warn(VK_LOG_OBJS(&iview->vk.base), + "Cannot fast-clear to colors which would require " + "format conversion on resolve"); + return false; + } + + /* We only allow fast clears to the first slice of an image (level 0, + * layer 0) and only for the entire slice. This guarantees us that, at + * any given time, there is only one clear color on any given image at + * any given time. At the time of our testing (Jan 17, 2018), there + * were no known applications which would benefit from fast-clearing + * more than just the first slice. + */ + if (iview->planes[0].isl.base_level > 0 || + iview->planes[0].isl.base_array_layer > 0) { + anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base), + "Rendering with multi-lod or multi-layer framebuffer " + "with LOAD_OP_LOAD and baseMipLevel > 0 or " + "baseArrayLayer > 0. Not fast clearing."); + return false; + } + + if (num_layers > 1) { + anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base), + "Rendering to a multi-layer framebuffer with " + "LOAD_OP_CLEAR. 
Only fast-clearing the first slice"); + } + + return true; +} + +static bool +anv_can_hiz_clear_ds_view(struct anv_device *device, + const struct anv_image_view *iview, + VkImageLayout layout, + VkImageAspectFlags clear_aspects, + float depth_clear_value, + VkRect2D render_area) +{ + /* We don't do any HiZ or depth fast-clears on gfx7 yet */ + if (GFX_VER == 7) + return false; + + /* If we're just clearing stencil, we can always HiZ clear */ + if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) + return true; + + /* We must have depth in order to have HiZ */ + if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) + return false; + + const enum isl_aux_usage clear_aux_usage = + anv_layout_to_aux_usage(device->info, iview->image, + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + layout); + if (!blorp_can_hiz_clear_depth(device->info, + &iview->image->planes[0].primary_surface.isl, + clear_aux_usage, + iview->planes[0].isl.base_level, + iview->planes[0].isl.base_array_layer, + render_area.offset.x, + render_area.offset.y, + render_area.offset.x + + render_area.extent.width, + render_area.offset.y + + render_area.extent.height)) + return false; + + if (depth_clear_value != ANV_HZ_FC_VAL) + return false; + + /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared + * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports + * returning 0.0f. Gens prior to gfx8 do not support this feature at all. + */ + if (GFX_VER == 8 && anv_can_sample_with_hiz(device->info, iview->image)) + return false; + + /* If we got here, then we can fast clear */ + return true; +} + +#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + +#if GFX_VER == 12 +static void +anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count) +{ + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + const struct anv_surface *surface = &image->planes[plane].primary_surface; + uint64_t base_address = + anv_address_physical(anv_image_address(image, &surface->memory_range)); + + const struct isl_surf *isl_surf = &image->planes[plane].primary_surface.isl; + uint64_t format_bits = intel_aux_map_format_bits_for_isl_surf(isl_surf); + + /* We're about to live-update the AUX-TT. We really don't want anyone else + * trying to read it while we're doing this. We could probably get away + * with not having this stall in some cases if we were really careful but + * it's better to play it safe. Full stall the GPU. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before update AUX-TT"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t a = 0; a < layer_count; a++) { + const uint32_t layer = base_layer + a; + + uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0; + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = base_level + l; + + uint32_t logical_array_layer, logical_z_offset_px; + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + logical_array_layer = 0; + + /* If the given miplevel does not have this layer, then any higher + * miplevels won't either because miplevels only get smaller the + * higher the LOD. 
+ */ + assert(layer < image->vk.extent.depth); + if (layer >= anv_minify(image->vk.extent.depth, level)) + break; + logical_z_offset_px = layer; + } else { + assert(layer < image->vk.array_layers); + logical_array_layer = layer; + logical_z_offset_px = 0; + } + + uint64_t slice_start_offset_B, slice_end_offset_B; + isl_surf_get_image_range_B_tile(isl_surf, level, + logical_array_layer, + logical_z_offset_px, + &slice_start_offset_B, + &slice_end_offset_B); + + start_offset_B = MIN2(start_offset_B, slice_start_offset_B); + end_offset_B = MAX2(end_offset_B, slice_end_offset_B); + } + + /* Aux operates 64K at a time */ + start_offset_B = align_down_u64(start_offset_B, 64 * 1024); + end_offset_B = align_u64(end_offset_B, 64 * 1024); + + for (uint64_t offset = start_offset_B; + offset < end_offset_B; offset += 64 * 1024) { + uint64_t address = base_address + offset; + + uint64_t aux_entry_addr64, *aux_entry_map; + aux_entry_map = intel_aux_map_get_entry(cmd_buffer->device->aux_map_ctx, + address, &aux_entry_addr64); + + assert(!anv_use_relocations(cmd_buffer->device->physical)); + struct anv_address aux_entry_address = { + .bo = NULL, + .offset = aux_entry_addr64, + }; + + const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map); + uint64_t new_aux_entry = + (old_aux_entry & INTEL_AUX_MAP_ADDRESS_MASK) | format_bits; + + if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) + new_aux_entry |= INTEL_AUX_MAP_ENTRY_VALID_BIT; + + mi_store(&b, mi_mem64(aux_entry_address), mi_imm(new_aux_entry)); + } + } + + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT, + "after update AUX-TT"); +} +#endif /* GFX_VER == 12 */ + +/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless + * the initial layout is undefined, the HiZ buffer and depth buffer will + * represent the same data at the end of this operation. + */ +static void +transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + uint32_t base_layer, uint32_t layer_count, + VkImageLayout initial_layout, + VkImageLayout final_layout, + bool will_full_fast_clear) +{ + const uint32_t depth_plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT); + if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE) + return; + +#if GFX_VER == 12 + if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || + initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) && + cmd_buffer->device->physical->has_implicit_ccs && + cmd_buffer->device->info->has_aux_map) { + anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, 1, base_layer, layer_count); + } +#endif + + /* If will_full_fast_clear is set, the caller promises to fast-clear the + * largest portion of the specified range as it can. For depth images, + * that means the entire image because we don't support multi-LOD HiZ. 
+ */ + assert(image->planes[0].primary_surface.isl.levels == 1); + if (will_full_fast_clear) + return; + + const enum isl_aux_state initial_state = + anv_layout_to_aux_state(cmd_buffer->device->info, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + initial_layout); + const enum isl_aux_state final_state = + anv_layout_to_aux_state(cmd_buffer->device->info, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + final_layout); + + const bool initial_depth_valid = + isl_aux_state_has_valid_primary(initial_state); + const bool initial_hiz_valid = + isl_aux_state_has_valid_aux(initial_state); + const bool final_needs_depth = + isl_aux_state_has_valid_primary(final_state); + const bool final_needs_hiz = + isl_aux_state_has_valid_aux(final_state); + + /* Getting into the pass-through state for Depth is tricky and involves + * both a resolve and an ambiguate. We don't handle that state right now + * as anv_layout_to_aux_state never returns it. + */ + assert(final_state != ISL_AUX_STATE_PASS_THROUGH); + + if (final_needs_depth && !initial_depth_valid) { + assert(initial_hiz_valid); + anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE); + } else if (final_needs_hiz && !initial_hiz_valid) { + assert(initial_depth_valid); + anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE); + } +} + +#if GFX_VER == 7 +static inline bool +vk_image_layout_stencil_write_optimal(VkImageLayout layout) +{ + return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL; +} +#endif + +/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless + * the initial layout is undefined, the HiZ buffer and depth buffer will + * represent the same data at the end of this operation. + */ +static void +transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count, + VkImageLayout initial_layout, + VkImageLayout final_layout, + bool will_full_fast_clear) +{ +#if GFX_VER == 7 + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); + + /* On gfx7, we have to store a texturable version of the stencil buffer in + * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and + * forth at strategic points. Stencil writes are only allowed in following + * layouts: + * + * - VK_IMAGE_LAYOUT_GENERAL + * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL + * + * For general, we have no nice opportunity to transition so we do the copy + * to the shadow unconditionally at the end of the subpass. For transfer + * destinations, we can update it as part of the transfer op. For the other + * layouts, we delay the copy until a transition into some other layout. 
+ */ + if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && + vk_image_layout_stencil_write_optimal(initial_layout) && + !vk_image_layout_stencil_write_optimal(final_layout)) { + anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + base_level, level_count, + base_layer, layer_count); + } +#elif GFX_VER == 12 + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + return; + + if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || + initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) && + cmd_buffer->device->physical->has_implicit_ccs && + cmd_buffer->device->info->has_aux_map) { + anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, + base_level, level_count, base_layer, layer_count); + + /* If will_full_fast_clear is set, the caller promises to fast-clear the + * largest portion of the specified range as it can. + */ + if (will_full_fast_clear) + return; + + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = base_level + l; + const VkRect2D clear_rect = { + .offset.x = 0, + .offset.y = 0, + .extent.width = anv_minify(image->vk.extent.width, level), + .extent.height = anv_minify(image->vk.extent.height, level), + }; + + uint32_t aux_layers = + anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level); + uint32_t level_layer_count = + MIN2(layer_count, aux_layers - base_layer); + + /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression + * Enable: + * + * "When enabled, Stencil Buffer needs to be initialized via + * stencil clear (HZ_OP) before any renderpass." + */ + anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, + level, base_layer, level_layer_count, + clear_rect, 0 /* Stencil clear value */); + } + } +#endif +} + +#define MI_PREDICATE_SRC0 0x2400 +#define MI_PREDICATE_SRC1 0x2408 +#define MI_PREDICATE_RESULT 0x2418 + +static void +set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t level, + uint32_t base_layer, uint32_t layer_count, + bool compressed) +{ + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + /* We only have compression tracking for CCS_E */ + if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E) + return; + + for (uint32_t a = 0; a < layer_count; a++) { + uint32_t layer = base_layer + a; + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device, + image, aspect, + level, layer); + sdi.ImmediateData = compressed ? UINT32_MAX : 0; + } + } +} + +static void +set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum anv_fast_clear_type fast_clear) +{ + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device, + image, aspect); + sdi.ImmediateData = fast_clear; + } + + /* Whenever we have fast-clear, we consider that slice to be compressed. + * This makes building predicates much easier. + */ + if (fast_clear != ANV_FAST_CLEAR_NONE) + set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true); +} + +/* This is only really practical on haswell and above because it requires + * MI math in order to get it correct. 
+ */ +#if GFX_VERx10 >= 75 +static void +anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t level, uint32_t array_layer, + enum isl_aux_op resolve_op, + enum anv_fast_clear_type fast_clear_supported) +{ + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + const struct mi_value fast_clear_type = + mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device, + image, aspect)); + + if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) { + /* In this case, we're doing a full resolve which means we want the + * resolve to happen if any compression (including fast-clears) is + * present. + * + * In order to simplify the logic a bit, we make the assumption that, + * if the first slice has been fast-cleared, it is also marked as + * compressed. See also set_image_fast_clear_state. + */ + const struct mi_value compression_state = + mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device, + image, aspect, + level, array_layer)); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state); + mi_store(&b, compression_state, mi_imm(0)); + + if (level == 0 && array_layer == 0) { + /* If the predicate is true, we want to write 0 to the fast clear type + * and, if it's false, leave it alone. We can do this by writing + * + * clear_type = clear_type & ~predicate; + */ + struct mi_value new_fast_clear_type = + mi_iand(&b, fast_clear_type, + mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0))); + mi_store(&b, fast_clear_type, new_fast_clear_type); + } + } else if (level == 0 && array_layer == 0) { + /* In this case, we are doing a partial resolve to get rid of fast-clear + * colors. We don't care about the compression state but we do care + * about how much fast clear is allowed by the final layout. + */ + assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); + assert(fast_clear_supported < ANV_FAST_CLEAR_ANY); + + /* We need to compute (fast_clear_supported < image->fast_clear) */ + struct mi_value pred = + mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred)); + + /* If the predicate is true, we want to write 0 to the fast clear type + * and, if it's false, leave it alone. We can do this by writing + * + * clear_type = clear_type & ~predicate; + */ + struct mi_value new_fast_clear_type = + mi_iand(&b, fast_clear_type, mi_inot(&b, pred)); + mi_store(&b, fast_clear_type, new_fast_clear_type); + } else { + /* In this case, we're trying to do a partial resolve on a slice that + * doesn't have clear color. There's nothing to do. 
+ */ + assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); + return; + } + + /* Set src1 to 0 and use a != condition */ + mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } +} +#endif /* GFX_VERx10 >= 75 */ + +#if GFX_VER <= 8 +static void +anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t level, uint32_t array_layer, + enum isl_aux_op resolve_op, + enum anv_fast_clear_type fast_clear_supported) +{ + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + struct mi_value fast_clear_type_mem = + mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device, + image, aspect)); + + /* This only works for partial resolves and only when the clear color is + * all or nothing. On the upside, this emits less command streamer code + * and works on Ivybridge and Bay Trail. + */ + assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); + assert(fast_clear_supported != ANV_FAST_CLEAR_ANY); + + /* We don't support fast clears on anything other than the first slice. */ + if (level > 0 || array_layer > 0) + return; + + /* On gfx8, we don't have a concept of default clear colors because we + * can't sample from CCS surfaces. It's enough to just load the fast clear + * state into the predicate register. + */ + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + mi_store(&b, fast_clear_type_mem, mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } +} +#endif /* GFX_VER <= 8 */ + +static void +anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + enum isl_format format, + struct isl_swizzle swizzle, + VkImageAspectFlagBits aspect, + uint32_t level, uint32_t array_layer, + enum isl_aux_op resolve_op, + enum anv_fast_clear_type fast_clear_supported) +{ + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + +#if GFX_VER >= 9 + anv_cmd_compute_resolve_predicate(cmd_buffer, image, + aspect, level, array_layer, + resolve_op, fast_clear_supported); +#else /* GFX_VER <= 8 */ + anv_cmd_simple_resolve_predicate(cmd_buffer, image, + aspect, level, array_layer, + resolve_op, fast_clear_supported); +#endif + + /* CCS_D only supports full resolves and BLORP will assert on us if we try + * to do a partial resolve on a CCS_D surface. 
+ */ + if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE && + image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D) + resolve_op = ISL_AUX_OP_FULL_RESOLVE; + + anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect, + level, array_layer, 1, resolve_op, NULL, true); +} + +static void +anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + enum isl_format format, + struct isl_swizzle swizzle, + VkImageAspectFlagBits aspect, + uint32_t array_layer, + enum isl_aux_op resolve_op, + enum anv_fast_clear_type fast_clear_supported) +{ + assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT); + assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); + +#if GFX_VERx10 >= 75 + anv_cmd_compute_resolve_predicate(cmd_buffer, image, + aspect, 0, array_layer, + resolve_op, fast_clear_supported); + + anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect, + array_layer, 1, resolve_op, NULL, true); +#else + unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail"); +#endif +} + +void +genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + uint32_t level, + uint32_t base_layer, + uint32_t layer_count) +{ + /* The aspect must be exactly one of the image aspects. */ + assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects)); + + /* The only compression types with more than just fast-clears are MCS, + * CCS_E, and HiZ. With HiZ we just trust the layout and don't actually + * track the current fast-clear and compression state. This leaves us + * with just MCS and CCS_E. + */ + if (aux_usage != ISL_AUX_USAGE_CCS_E && + aux_usage != ISL_AUX_USAGE_MCS) + return; + + set_image_compressed_bit(cmd_buffer, image, aspect, + level, base_layer, layer_count, true); +} + +static void +init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect) +{ + assert(cmd_buffer && image); + assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + + set_image_fast_clear_state(cmd_buffer, image, aspect, + ANV_FAST_CLEAR_NONE); + + /* Initialize the struct fields that are accessed for fast-clears so that + * the HW restrictions on the field values are satisfied. + */ + struct anv_address addr = + anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); + + if (GFX_VER >= 9) { + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + const unsigned num_dwords = GFX_VER >= 10 ? + isl_dev->ss.clear_color_state_size / 4 : + isl_dev->ss.clear_value_size / 4; + for (unsigned i = 0; i < num_dwords; i++) { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = addr; + sdi.Address.offset += i * 4; + sdi.ImmediateData = 0; + } + } + } else { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = addr; + if (GFX_VERx10 >= 75) { + /* Pre-SKL, the dword containing the clear values also contains + * other fields, so we need to initialize those fields to match the + * values that would be in a color attachment. + */ + sdi.ImmediateData = ISL_CHANNEL_SELECT_RED << 25 | + ISL_CHANNEL_SELECT_GREEN << 22 | + ISL_CHANNEL_SELECT_BLUE << 19 | + ISL_CHANNEL_SELECT_ALPHA << 16; + } else if (GFX_VER == 7) { + /* On IVB, the dword containing the clear values also contains + * other fields that must be zero or can be zero. 
+ */ + sdi.ImmediateData = 0; + } + } + } +} + +/* Copy the fast-clear value dword(s) between a surface state object and an + * image's fast clear state buffer. + */ +static void +genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer, + struct anv_state surface_state, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + bool copy_from_surface_state) +{ + assert(cmd_buffer && image); + assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + + struct anv_address ss_clear_addr = { + .bo = cmd_buffer->device->surface_state_pool.block_pool.bo, + .offset = surface_state.offset + + cmd_buffer->device->isl_dev.ss.clear_value_offset, + }; + const struct anv_address entry_addr = + anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); + unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size; + +#if GFX_VER == 7 + /* On gfx7, the combination of commands used here(MI_LOAD_REGISTER_MEM + * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is + * in-flight when they are issued even if the memory touched is not + * currently active for rendering. The weird bit is that it is not the + * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight + * rendering hangs such that the next stalling command after the + * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang. + * + * It is unclear exactly why this hang occurs. Both MI commands come with + * warnings about the 3D pipeline but that doesn't seem to fully explain + * it. My (Jason's) best theory is that it has something to do with the + * fact that we're using a GPU state register as our temporary and that + * something with reading/writing it is causing problems. + * + * In order to work around this issue, we emit a PIPE_CONTROL with the + * command streamer stall bit set. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "after copy_fast_clear_dwords. Avoid potential hang"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); +#endif + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + if (copy_from_surface_state) { + mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size); + } else { + mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size); + + /* Updating a surface state object may require that the state cache be + * invalidated. From the SKL PRM, Shared Functions -> State -> State + * Caching: + * + * Whenever the RENDER_SURFACE_STATE object in memory pointed to by + * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is + * modified [...], the L1 state cache must be invalidated to ensure + * the new surface or sampler state is fetched from system memory. + * + * In testing, SKL doesn't actually seem to need this, but HSW does. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_STATE_CACHE_INVALIDATE_BIT, + "after copy_fast_clear_dwords surface state update"); + } +} + +/** + * @brief Transitions a color buffer from one layout to another. + * + * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for + * more information. + * + * @param level_count VK_REMAINING_MIP_LEVELS isn't supported. + * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images, + * this represents the maximum layers to transition at each + * specified miplevel. 
+ */ +static void +transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + const uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count, + VkImageLayout initial_layout, + VkImageLayout final_layout, + uint64_t src_queue_family, + uint64_t dst_queue_family, + bool will_full_fast_clear) +{ + struct anv_device *device = cmd_buffer->device; + const struct intel_device_info *devinfo = device->info; + /* Validate the inputs. */ + assert(cmd_buffer); + assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + /* These values aren't supported for simplicity's sake. */ + assert(level_count != VK_REMAINING_MIP_LEVELS && + layer_count != VK_REMAINING_ARRAY_LAYERS); + /* Ensure the subresource range is valid. */ + UNUSED uint64_t last_level_num = base_level + level_count; + const uint32_t max_depth = anv_minify(image->vk.extent.depth, base_level); + UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth); + assert((uint64_t)base_layer + layer_count <= image_layers); + assert(last_level_num <= image->vk.mip_levels); + /* If there is a layout transfer, the final layout cannot be undefined or + * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198). + */ + assert(initial_layout == final_layout || + (final_layout != VK_IMAGE_LAYOUT_UNDEFINED && + final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED)); + const struct isl_drm_modifier_info *isl_mod_info = + image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT + ? isl_drm_modifier_get_info(image->vk.drm_format_mod) + : NULL; + + const bool src_queue_external = + src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT || + src_queue_family == VK_QUEUE_FAMILY_EXTERNAL; + + const bool dst_queue_external = + dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT || + dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL; + + /* Simultaneous acquire and release on external queues is illegal. */ + assert(!src_queue_external || !dst_queue_external); + + /* Ownership transition on an external queue requires special action if the + * image has a DRM format modifier because we store image data in + * a driver-private bo which is inaccessible to the external queue. + */ + const bool private_binding_acquire = + src_queue_external && + anv_image_is_externally_shared(image) && + anv_image_has_private_binding(image); + + const bool private_binding_release = + dst_queue_external && + anv_image_is_externally_shared(image) && + anv_image_has_private_binding(image); + + if (initial_layout == final_layout && + !private_binding_acquire && !private_binding_release) { + /* No work is needed. */ + return; + } + + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && + final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { + /* This surface is a linear compressed image with a tiled shadow surface + * for texturing. The client is about to use it in READ_ONLY_OPTIMAL so + * we need to ensure the shadow copy is up-to-date. 
+ */ + assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT); + assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); + assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR); + assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR); + assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format)); + assert(plane == 0); + anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_COLOR_BIT, + base_level, level_count, + base_layer, layer_count); + } + + if (base_layer >= anv_image_aux_layers(image, aspect, base_level)) + return; + + assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR); + + /* The following layouts are equivalent for non-linear images. */ + const bool initial_layout_undefined = + initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || + initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED; + + bool must_init_fast_clear_state = false; + bool must_init_aux_surface = false; + + if (initial_layout_undefined) { + /* The subresource may have been aliased and populated with arbitrary + * data. + */ + must_init_fast_clear_state = true; + must_init_aux_surface = true; + } else if (private_binding_acquire) { + /* The fast clear state lives in a driver-private bo, and therefore the + * external/foreign queue is unaware of it. + * + * If this is the first time we are accessing the image, then the fast + * clear state is uninitialized. + * + * If this is NOT the first time we are accessing the image, then the fast + * clear state may still be valid and correct due to the resolve during + * our most recent ownership release. However, we do not track the aux + * state with MI stores, and therefore must assume the worst-case: that + * this is the first time we are accessing the image. + */ + assert(image->planes[plane].fast_clear_memory_range.binding == + ANV_IMAGE_MEMORY_BINDING_PRIVATE); + must_init_fast_clear_state = true; + + if (image->planes[plane].aux_surface.memory_range.binding == + ANV_IMAGE_MEMORY_BINDING_PRIVATE) { + assert(isl_mod_info->aux_usage == ISL_AUX_USAGE_NONE); + + /* The aux surface, like the fast clear state, lives in + * a driver-private bo. We must initialize the aux surface for the + * same reasons we must initialize the fast clear state. + */ + must_init_aux_surface = true; + } else { + assert(isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE); + + /* The aux surface, unlike the fast clear state, lives in + * application-visible VkDeviceMemory and is shared with the + * external/foreign queue. Therefore, when we acquire ownership of the + * image with a defined VkImageLayout, the aux surface is valid and has + * the aux state required by the modifier. + */ + must_init_aux_surface = false; + } + } + +#if GFX_VER == 12 + if (initial_layout_undefined) { + if (device->physical->has_implicit_ccs && devinfo->has_aux_map) { + anv_image_init_aux_tt(cmd_buffer, image, aspect, + base_level, level_count, + base_layer, layer_count); + } + } +#else + assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map)); +#endif + + if (must_init_fast_clear_state) { + if (base_level == 0 && base_layer == 0) + init_fast_clear_color(cmd_buffer, image, aspect); + } + + if (must_init_aux_surface) { + assert(must_init_fast_clear_state); + + /* Initialize the aux buffers to enable correct rendering. In order to + * ensure that things such as storage images work correctly, aux buffers + * need to be initialized to valid data. 
+ *
+ * Having an aux buffer with invalid data is a problem for two reasons:
+ *
+ * 1) Having an invalid value in the buffer can confuse the hardware.
+ * For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
+ * invalid and leads to the hardware doing strange things. It
+ * doesn't hang as far as we can tell but rendering corruption can
+ * occur.
+ *
+ * 2) If this transition is into the GENERAL layout and we then use the
+ * image as a storage image, then we must have the aux buffer in the
+ * pass-through state so that, if we then go to texture from the
+ * image, we get the results of our storage image writes and not the
+ * fast clear color or other random data.
+ *
+ * For CCS both of the problems above are real demonstrable issues. In
+ * that case, the only thing we can do is to perform an ambiguate to
+ * transition the aux surface into the pass-through state.
+ *
+ * For MCS, (2) is never an issue because we don't support multisampled
+ * storage images. In theory, issue (1) is a problem with MCS but we've
+ * never seen it in the wild. For 4x and 16x, all bit patterns could, in
+ * theory, be interpreted as something but we don't know that all bit
+ * patterns are actually valid. For 2x and 8x, you could easily end up
+ * with the MCS referring to an invalid plane because not all bits of
+ * the MCS value are actually used. Even though we've never seen issues
+ * in the wild, it's best to play it safe and initialize the MCS. We
+ * can use a fast-clear for MCS because we only ever touch it from render
+ * and texture (no image load store).
+ */
+ if (image->vk.samples == 1) {
+ for (uint32_t l = 0; l < level_count; l++) {
+ const uint32_t level = base_level + l;
+
+ uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
+ if (base_layer >= aux_layers)
+ break; /* We will only get fewer layers as level increases */
+ uint32_t level_layer_count =
+ MIN2(layer_count, aux_layers - base_layer);
+
+ /* If will_full_fast_clear is set, the caller promises to
+ * fast-clear the largest portion of the specified range as it can.
+ * For color images, that means only the first LOD and array slice.
+ */
+ if (level == 0 && base_layer == 0 && will_full_fast_clear) {
+ base_layer++;
+ level_layer_count--;
+ if (level_layer_count == 0)
+ continue;
+ }
+
+ anv_image_ccs_op(cmd_buffer, image,
+ image->planes[plane].primary_surface.isl.format,
+ ISL_SWIZZLE_IDENTITY,
+ aspect, level, base_layer, level_layer_count,
+ ISL_AUX_OP_AMBIGUATE, NULL, false);
+
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
+ set_image_compressed_bit(cmd_buffer, image, aspect,
+ level, base_layer, level_layer_count,
+ false);
+ }
+ }
+ } else {
+ if (image->vk.samples == 4 || image->vk.samples == 16) {
+ anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
+ "Doing a potentially unnecessary fast-clear to "
+ "define an MCS buffer.");
+ }
+
+ /* If will_full_fast_clear is set, the caller promises to fast-clear
+ * the largest portion of the specified range as it can. 
+ */ + if (will_full_fast_clear) + return; + + assert(base_level == 0 && level_count == 1); + anv_image_mcs_op(cmd_buffer, image, + image->planes[plane].primary_surface.isl.format, + ISL_SWIZZLE_IDENTITY, + aspect, base_layer, layer_count, + ISL_AUX_OP_FAST_CLEAR, NULL, false); + } + return; + } + + enum isl_aux_usage initial_aux_usage = + anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout); + enum isl_aux_usage final_aux_usage = + anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout); + enum anv_fast_clear_type initial_fast_clear = + anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout); + enum anv_fast_clear_type final_fast_clear = + anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout); + + /* We must override the anv_layout_to_* functions because they are unaware of + * acquire/release direction. + */ + if (private_binding_acquire) { + initial_aux_usage = isl_mod_info->aux_usage; + initial_fast_clear = isl_mod_info->supports_clear_color ? + initial_fast_clear : ANV_FAST_CLEAR_NONE; + } else if (private_binding_release) { + final_aux_usage = isl_mod_info->aux_usage; + final_fast_clear = isl_mod_info->supports_clear_color ? + final_fast_clear : ANV_FAST_CLEAR_NONE; + } + + /* The current code assumes that there is no mixing of CCS_E and CCS_D. + * We can handle transitions between CCS_D/E to and from NONE. What we + * don't yet handle is switching between CCS_E and CCS_D within a given + * image. Doing so in a performant way requires more detailed aux state + * tracking such as what is done in i965. For now, just assume that we + * only have one type of compression. + */ + assert(initial_aux_usage == ISL_AUX_USAGE_NONE || + final_aux_usage == ISL_AUX_USAGE_NONE || + initial_aux_usage == final_aux_usage); + + /* If initial aux usage is NONE, there is nothing to resolve */ + if (initial_aux_usage == ISL_AUX_USAGE_NONE) + return; + + enum isl_aux_op resolve_op = ISL_AUX_OP_NONE; + + /* If the initial layout supports more fast clear than the final layout + * then we need at least a partial resolve. + */ + if (final_fast_clear < initial_fast_clear) + resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE; + + if (initial_aux_usage == ISL_AUX_USAGE_CCS_E && + final_aux_usage != ISL_AUX_USAGE_CCS_E) + resolve_op = ISL_AUX_OP_FULL_RESOLVE; + + if (resolve_op == ISL_AUX_OP_NONE) + return; + + /* Perform a resolve to synchronize data between the main and aux buffer. + * Before we begin, we must satisfy the cache flushing requirement specified + * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)": + * + * Any transition from any value in {Clear, Render, Resolve} to a + * different value in {Clear, Render, Resolve} requires end of pipe + * synchronization. + * + * We perform a flush of the write cache before and after the clear and + * resolve operations to meet this requirement. + * + * Unlike other drawing, fast clear operations are not properly + * synchronized. The first PIPE_CONTROL here likely ensures that the + * contents of the previous render or clear hit the render target before we + * resolve and the second likely ensures that the resolve is complete before + * we do any more rendering or clearing. 
+ */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after transition RT"); + + for (uint32_t l = 0; l < level_count; l++) { + uint32_t level = base_level + l; + + uint32_t aux_layers = anv_image_aux_layers(image, aspect, level); + if (base_layer >= aux_layers) + break; /* We will only get fewer layers as level increases */ + uint32_t level_layer_count = + MIN2(layer_count, aux_layers - base_layer); + + for (uint32_t a = 0; a < level_layer_count; a++) { + uint32_t array_layer = base_layer + a; + + /* If will_full_fast_clear is set, the caller promises to fast-clear + * the largest portion of the specified range as it can. For color + * images, that means only the first LOD and array slice. + */ + if (level == 0 && array_layer == 0 && will_full_fast_clear) + continue; + + if (image->vk.samples == 1) { + anv_cmd_predicated_ccs_resolve(cmd_buffer, image, + image->planes[plane].primary_surface.isl.format, + ISL_SWIZZLE_IDENTITY, + aspect, level, array_layer, resolve_op, + final_fast_clear); + } else { + /* We only support fast-clear on the first layer so partial + * resolves should not be used on other layers as they will use + * the clear color stored in memory that is only valid for layer0. + */ + if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE && + array_layer != 0) + continue; + + anv_cmd_predicated_mcs_resolve(cmd_buffer, image, + image->planes[plane].primary_surface.isl.format, + ISL_SWIZZLE_IDENTITY, + aspect, array_layer, resolve_op, + final_fast_clear); + } + } + } + + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after transition RT"); +} + +static MUST_CHECK VkResult +anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer, + uint32_t color_att_count) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + /* Reserve one for the NULL state. 
*/ + unsigned num_states = 1 + color_att_count; + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align); + gfx->att_states = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, + num_states * ss_stride, isl_dev->ss.align); + if (gfx->att_states.map == NULL) { + return anv_batch_set_error(&cmd_buffer->batch, + VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + struct anv_state next_state = gfx->att_states; + next_state.alloc_size = isl_dev->ss.size; + + gfx->null_surface_state = next_state; + next_state.offset += ss_stride; + next_state.map += ss_stride; + + gfx->color_att_count = color_att_count; + for (uint32_t i = 0; i < color_att_count; i++) { + gfx->color_att[i] = (struct anv_attachment) { + .surface_state.state = next_state, + }; + next_state.offset += ss_stride; + next_state.map += ss_stride; + } + gfx->depth_att = (struct anv_attachment) { }; + gfx->stencil_att = (struct anv_attachment) { }; + + return VK_SUCCESS; +} + +static void +anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + gfx->render_area = (VkRect2D) { }; + gfx->layer_count = 0; + gfx->samples = 0; + + gfx->color_att_count = 0; + gfx->depth_att = (struct anv_attachment) { }; + gfx->stencil_att = (struct anv_attachment) { }; + gfx->null_surface_state = ANV_STATE_NULL; +} + +VkResult +genX(BeginCommandBuffer)( + VkCommandBuffer commandBuffer, + const VkCommandBufferBeginInfo* pBeginInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + VkResult result; + + /* If this is the first vkBeginCommandBuffer, we must *initialize* the + * command buffer's state. Otherwise, we must *reset* its state. In both + * cases we reset it. + * + * From the Vulkan 1.0 spec: + * + * If a command buffer is in the executable state and the command buffer + * was allocated from a command pool with the + * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then + * vkBeginCommandBuffer implicitly resets the command buffer, behaving + * as if vkResetCommandBuffer had been called with + * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts + * the command buffer in the recording state. + */ + anv_cmd_buffer_reset(cmd_buffer); + anv_cmd_buffer_reset_rendering(cmd_buffer); + + cmd_buffer->usage_flags = pBeginInfo->flags; + + /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for + * primary level command buffers. + * + * From the Vulkan 1.0 spec: + * + * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a + * secondary command buffer is considered to be entirely inside a render + * pass. If this is a primary command buffer, then this bit is ignored. + */ + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) + cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT; + + trace_intel_begin_cmd_buffer(&cmd_buffer->trace); + + genX(cmd_buffer_emit_state_base_address)(cmd_buffer); + + /* We sometimes store vertex data in the dynamic state buffer for blorp + * operations and our dynamic state stream may re-use data from previous + * command buffers. In order to prevent stale cache data, we flush the VF + * cache. We could do this on every blorp call but that's not really + * needed as all of the data will get written by the CPU prior to the GPU + * executing anything. 
The chances are fairly high that they will use + * blorp at least once per primary command buffer so it shouldn't be + * wasted. + * + * There is also a workaround on gfx8 which requires us to invalidate the + * VF cache occasionally. It's easier if we can assume we start with a + * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).) + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_VF_CACHE_INVALIDATE_BIT, + "new cmd buffer"); + + /* Re-emit the aux table register in every command buffer. This way we're + * ensured that we have the table even if this command buffer doesn't + * initialize any images. + */ + if (cmd_buffer->device->info->has_aux_map) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT, + "new cmd buffer with aux-tt"); + } + + /* We send an "Indirect State Pointers Disable" packet at + * EndCommandBuffer, so all push constant packets are ignored during a + * context restore. Documentation says after that command, we need to + * emit push constants again before any rendering operation. So we + * flag them dirty here to make sure they get emitted. + */ + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; + + if (cmd_buffer->usage_flags & + VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)]; + const VkRenderingInfo *resume_info = + vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level, + pBeginInfo, + gcbiar_data); + if (resume_info != NULL) { + genX(CmdBeginRendering)(commandBuffer, resume_info); + } else { + const VkCommandBufferInheritanceRenderingInfo *inheritance_info = + vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level, + pBeginInfo); + assert(inheritance_info); + + gfx->rendering_flags = inheritance_info->flags; + gfx->render_area = (VkRect2D) { }; + gfx->layer_count = 0; + gfx->samples = inheritance_info->rasterizationSamples; + gfx->view_mask = inheritance_info->viewMask; + + uint32_t color_att_count = inheritance_info->colorAttachmentCount; + result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count); + if (result != VK_SUCCESS) + return result; + + for (uint32_t i = 0; i < color_att_count; i++) { + gfx->color_att[i].vk_format = + inheritance_info->pColorAttachmentFormats[i]; + } + gfx->depth_att.vk_format = + inheritance_info->depthAttachmentFormat; + gfx->stencil_att.vk_format = + inheritance_info->stencilAttachmentFormat; + + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; + } + } + +#if GFX_VER >= 8 + /* Emit the sample pattern at the beginning of the batch because the + * default locations emitted at the device initialization might have been + * changed by a previous command buffer. + * + * Do not change that when we're continuing a previous renderpass. + */ + if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations && + !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) + genX(emit_sample_pattern)(&cmd_buffer->batch, NULL); +#endif + +#if GFX_VERx10 >= 75 + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { + const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info = + vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT); + + /* If secondary buffer supports conditional rendering + * we should emit commands as if conditional rendering is enabled. 
+ */
+ cmd_buffer->state.conditional_render_enabled =
+ conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
+ }
+#endif
+
+ return VK_SUCCESS;
+}
+
+/* From the PRM, Volume 2a:
+ *
+ * "Indirect State Pointers Disable
+ *
+ * At the completion of the post-sync operation associated with this pipe
+ * control packet, the indirect state pointers in the hardware are
+ * considered invalid; the indirect pointers are not saved in the context.
+ * If any new indirect state commands are executed in the command stream
+ * while the pipe control is pending, the new indirect state commands are
+ * preserved.
+ *
+ * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
+ * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
+ * commands are only considered as Indirect State Pointers. Once ISP is
+ * issued in a context, SW must initialize by programming push constant
+ * commands for all the shaders (at least to zero length) before attempting
+ * any rendering operation for the same context."
+ *
+ * 3DSTATE_CONSTANT_* packets are restored during a context restore,
+ * even though they point to a BO that has been already unreferenced at
+ * the end of the previous batch buffer. This has been fine so far since
+ * we are protected by the scratch page (every address not covered by
+ * a BO should be pointing to the scratch page). But on CNL, it is
+ * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
+ * instruction.
+ *
+ * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
+ * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
+ * context restore, so the mentioned hang doesn't happen. However,
+ * software must program push constant commands for all stages prior to
+ * rendering anything. So we flag them dirty in BeginCommandBuffer.
+ *
+ * Finally, we also make sure to stall at pixel scoreboard to make sure the
+ * constants have been loaded into the EUs prior to disabling the push
+ * constants, so that it doesn't hang a previous 3DPRIMITIVE.
+ */
+static void
+emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
+{
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.StallAtPixelScoreboard = true;
+ pc.CommandStreamerStallEnable = true;
+ anv_debug_dump_pc(pc);
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.IndirectStatePointersDisable = true;
+ pc.CommandStreamerStallEnable = true;
+ anv_debug_dump_pc(pc);
+ }
+}
+
+VkResult
+genX(EndCommandBuffer)(
+ VkCommandBuffer commandBuffer)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return cmd_buffer->batch.status;
+
+ anv_measure_endcommandbuffer(cmd_buffer);
+
+ /* We want every command buffer to start with the PMA fix in a known state,
+ * so we disable it at the end of the command buffer. 
+ */ + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false); + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + emit_isp_disable(cmd_buffer); + + trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level); + + anv_cmd_buffer_end_batch_buffer(cmd_buffer); + + return VK_SUCCESS; +} + +void +genX(CmdExecuteCommands)( + VkCommandBuffer commandBuffer, + uint32_t commandBufferCount, + const VkCommandBuffer* pCmdBuffers) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer); + + assert(primary->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); + + if (anv_batch_has_error(&primary->batch)) + return; + + /* The secondary command buffers will assume that the PMA fix is disabled + * when they begin executing. Make sure this is true. + */ + genX(cmd_buffer_enable_pma_fix)(primary, false); + + /* The secondary command buffer doesn't know which textures etc. have been + * flushed prior to their execution. Apply those flushes now. + */ + genX(cmd_buffer_apply_pipe_flushes)(primary); + + for (uint32_t i = 0; i < commandBufferCount; i++) { + ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]); + + assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + assert(!anv_batch_has_error(&secondary->batch)); + +#if GFX_VERx10 >= 75 + if (secondary->state.conditional_render_enabled) { + if (!primary->state.conditional_render_enabled) { + /* Secondary buffer is constructed as if it will be executed + * with conditional rendering, we should satisfy this dependency + * regardless of conditional rendering being enabled in primary. + */ + struct mi_builder b; + mi_builder_init(&b, primary->device->info, &primary->batch); + mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG), + mi_imm(UINT64_MAX)); + } + } +#endif + + if (secondary->usage_flags & + VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { + /* If we're continuing a render pass from the primary, we need to + * copy the surface states for the current subpass into the storage + * we allocated for them in BeginCommandBuffer. + */ + struct anv_bo *ss_bo = + primary->device->surface_state_pool.block_pool.bo; + struct anv_state src_state = primary->state.gfx.att_states; + struct anv_state dst_state = secondary->state.gfx.att_states; + assert(src_state.alloc_size == dst_state.alloc_size); + + genX(cmd_buffer_so_memcpy)(primary, + (struct anv_address) { + .bo = ss_bo, + .offset = dst_state.offset, + }, + (struct anv_address) { + .bo = ss_bo, + .offset = src_state.offset, + }, + src_state.alloc_size); + } + + anv_cmd_buffer_add_secondary(primary, secondary); + + assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL || + secondary->perf_query_pool == primary->perf_query_pool); + if (secondary->perf_query_pool) + primary->perf_query_pool = secondary->perf_query_pool; + +#if GFX_VERx10 == 120 + if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN) + primary->state.depth_reg_mode = secondary->state.depth_reg_mode; +#endif + } + + /* The secondary isn't counted in our VF cache tracking so we need to + * invalidate the whole thing. + */ + if (GFX_VER >= 8 && GFX_VER <= 9) { + anv_add_pending_pipe_bits(primary, + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT, + "Secondary cmd buffer not tracked in VF cache"); + } + + /* The secondary may have selected a different pipeline (3D or compute) and + * may have changed the current L3$ configuration. 
Reset our tracking + * variables to invalid values to ensure that we re-emit these in the case + * where we do any draws or compute dispatches from the primary after the + * secondary has returned. + */ + primary->state.current_pipeline = UINT32_MAX; + primary->state.current_l3_config = NULL; + primary->state.current_hash_scale = 0; + primary->state.gfx.push_constant_stages = 0; + vk_dynamic_graphics_state_dirty_all(&primary->vk.dynamic_graphics_state); + + /* Each of the secondary command buffers will use its own state base + * address. We need to re-emit state base address for the primary after + * all of the secondaries are done. + * + * TODO: Maybe we want to make this a dirty bit to avoid extra state base + * address calls? + */ + genX(cmd_buffer_emit_state_base_address)(primary); +} + +/** + * Program the hardware to use the specified L3 configuration. + */ +void +genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, + const struct intel_l3_config *cfg) +{ + assert(cfg || GFX_VER >= 12); + if (cfg == cmd_buffer->state.current_l3_config) + return; + +#if GFX_VER >= 11 + /* On Gfx11+ we use only one config, so verify it remains the same and skip + * the stalling programming entirely. + */ + assert(cfg == cmd_buffer->device->l3_config); +#else + if (INTEL_DEBUG(DEBUG_L3)) { + mesa_logd("L3 config transition: "); + intel_dump_l3_config(cfg, stderr); + } + + /* According to the hardware docs, the L3 partitioning can only be changed + * while the pipeline is completely drained and the caches are flushed, + * which involves a first PIPE_CONTROL flush which stalls the pipeline... + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DCFlushEnable = true; + pc.PostSyncOperation = NoWrite; + pc.CommandStreamerStallEnable = true; + anv_debug_dump_pc(pc); + } + + /* ...followed by a second pipelined PIPE_CONTROL that initiates + * invalidation of the relevant caches. Note that because RO invalidation + * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL + * command is processed by the CS) we cannot combine it with the previous + * stalling flush as the hardware documentation suggests, because that + * would cause the CS to stall on previous rendering *after* RO + * invalidation and wouldn't prevent the RO caches from being polluted by + * concurrent rendering before the stall completes. This intentionally + * doesn't implement the SKL+ hardware workaround suggesting to enable CS + * stall on PIPE_CONTROLs with the texture cache invalidation bit set for + * GPGPU workloads because the previous and subsequent PIPE_CONTROLs + * already guarantee that there is no concurrent GPGPU kernel execution + * (see SKL HSD 2132585). + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.TextureCacheInvalidationEnable = true; + pc.ConstantCacheInvalidationEnable = true; + pc.InstructionCacheInvalidateEnable = true; + pc.StateCacheInvalidationEnable = true; + pc.PostSyncOperation = NoWrite; + anv_debug_dump_pc(pc); + } + + /* Now send a third stalling flush to make sure that invalidation is + * complete when the L3 configuration registers are modified. 
+ */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DCFlushEnable = true; + pc.PostSyncOperation = NoWrite; + pc.CommandStreamerStallEnable = true; + anv_debug_dump_pc(pc); + } + + genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg); +#endif /* GFX_VER >= 11 */ + cmd_buffer->state.current_l3_config = cfg; +} + +enum anv_pipe_bits +genX(emit_apply_pipe_flushes)(struct anv_batch *batch, + struct anv_device *device, + uint32_t current_pipeline, + enum anv_pipe_bits bits) +{ + /* + * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization": + * + * Write synchronization is a special case of end-of-pipe + * synchronization that requires that the render cache and/or depth + * related caches are flushed to memory, where the data will become + * globally visible. This type of synchronization is required prior to + * SW (CPU) actually reading the result data from memory, or initiating + * an operation that will use as a read surface (such as a texture + * surface) a previous render target and/or depth/stencil buffer + * + * + * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": + * + * Exercising the write cache flush bits (Render Target Cache Flush + * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only + * ensures the write caches are flushed and doesn't guarantee the data + * is globally visible. + * + * SW can track the completion of the end-of-pipe-synchronization by + * using "Notify Enable" and "PostSync Operation - Write Immediate + * Data" in the PIPE_CONTROL command. + * + * In other words, flushes are pipelined while invalidations are handled + * immediately. Therefore, if we're flushing anything then we need to + * schedule an end-of-pipe sync before any invalidations can happen. + */ + if (bits & ANV_PIPE_FLUSH_BITS) + bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; + + + /* HSD 1209978178: docs say that before programming the aux table: + * + * "Driver must ensure that the engine is IDLE but ensure it doesn't + * add extra flushes in the case it knows that the engine is already + * IDLE." + */ + if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)) + bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; + + /* If we're going to do an invalidate and we have a pending end-of-pipe + * sync that has yet to be resolved, we do the end-of-pipe sync now. + */ + if ((bits & ANV_PIPE_INVALIDATE_BITS) && + (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) { + bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; + bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; + } + + /* Project: SKL / Argument: LRI Post Sync Operation [23] + * + * "PIPECONTROL command with “Command Streamer Stall Enable” must be + * programmed prior to programming a PIPECONTROL command with "LRI + * Post Sync Operation" in GPGPU mode of operation (i.e when + * PIPELINE_SELECT command is set to GPGPU mode of operation)." + * + * The same text exists a few rows below for Post Sync Op. + */ + if (bits & ANV_PIPE_POST_SYNC_BIT) { + if (GFX_VER == 9 && current_pipeline == GPGPU) + bits |= ANV_PIPE_CS_STALL_BIT; + bits &= ~ANV_PIPE_POST_SYNC_BIT; + } + + if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | + ANV_PIPE_END_OF_PIPE_SYNC_BIT)) { + anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) { +#if GFX_VERx10 >= 125 + /* BSpec 47112: PIPE_CONTROL::Untyped Data-Port Cache Flush: + * + * "'HDC Pipeline Flush' bit must be set for this bit to take + * effect." 
+ * + * BSpec 47112: PIPE_CONTROL::HDC Pipeline Flush: + * + * "When the "Pipeline Select" mode in PIPELINE_SELECT command is + * set to "3D", HDC Pipeline Flush can also flush/invalidate the + * LSC Untyped L1 cache based on the programming of HDC_Chicken0 + * register bits 13:11." + * + * "When the 'Pipeline Select' mode is set to 'GPGPU', the LSC + * Untyped L1 cache flush is controlled by 'Untyped Data-Port + * Cache Flush' bit in the PIPE_CONTROL command." + * + * As part of Wa_1608949956 & Wa_14010198302, i915 is programming + * HDC_CHICKEN0[11:13] = 0 ("Untyped L1 is flushed, for both 3D + * Pipecontrol Dataport flush, and UAV coherency barrier event"). + * So there is no need to set "Untyped Data-Port Cache" in 3D + * mode. + */ + pipe.UntypedDataPortCacheFlushEnable = + (bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT) && + current_pipeline == GPGPU; + pipe.HDCPipelineFlushEnable |= pipe.UntypedDataPortCacheFlushEnable; +#endif +#if GFX_VER >= 12 + pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT; + pipe.HDCPipelineFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT; +#else + /* Flushing HDC pipeline requires DC Flush on earlier HW. */ + pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT; +#endif + pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; + pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT; + pipe.RenderTargetCacheFlushEnable = + bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + + /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must + * be set with any PIPE_CONTROL with Depth Flush Enable bit set. + */ +#if GFX_VER >= 12 + pipe.DepthStallEnable = + pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT); +#else + pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT; +#endif + +#if GFX_VERx10 >= 125 + pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT; +#endif + + pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT; +#if GFX_VER == 8 + /* From Broadwell PRM, volume 2a: + * PIPE_CONTROL: Command Streamer Stall Enable: + * + * "This bit must be always set when PIPE_CONTROL command is + * programmed by GPGPU and MEDIA workloads, except for the cases + * when only Read Only Cache Invalidation bits are set (State + * Cache Invalidation Enable, Instruction cache Invalidation + * Enable, Texture Cache Invalidation Enable, Constant Cache + * Invalidation Enable). This is to WA FFDOP CG issue, this WA + * need not implemented when FF_DOP_CG is disabled." + * + * Since we do all the invalidation in the following PIPE_CONTROL, + * if we got here, we need a stall. + */ + pipe.CommandStreamerStallEnable |= current_pipeline == GPGPU; +#endif + + pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + + /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory": + * + * "The most common action to perform upon reaching a + * synchronization point is to write a value out to memory. An + * immediate value (included with the synchronization command) may + * be written." + * + * + * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization": + * + * "In case the data flushed out by the render engine is to be + * read back in to the render engine in coherent manner, then the + * render engine has to wait for the fence completion before + * accessing the flushed data. This can be achieved by following + * means on various products: PIPE_CONTROL command with CS Stall + * and the required write caches flushed with Post-Sync-Operation + * as Write Immediate Data. 
+ *
+ * Example:
+ * - Workload-1 (3D/GPGPU/MEDIA)
+ * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
+ * Immediate Data, Required Write Cache Flush bits set)
+ * - Workload-2 (Can use the data produce or output by
+ * Workload-1)
+ */
+ if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
+ pipe.CommandStreamerStallEnable = true;
+ pipe.PostSyncOperation = WriteImmediateData;
+ pipe.Address = device->workaround_address;
+ }
+
+ /*
+ * According to the Broadwell documentation, any PIPE_CONTROL with the
+ * "Command Streamer Stall" bit set must also have another bit set,
+ * with five different options:
+ *
+ * - Render Target Cache Flush
+ * - Depth Cache Flush
+ * - Stall at Pixel Scoreboard
+ * - Post-Sync Operation
+ * - Depth Stall
+ * - DC Flush Enable
+ *
+ * I chose "Stall at Pixel Scoreboard" since that's what we use in
+ * mesa and it seems to work fine. The choice is fairly arbitrary.
+ */
+ if (pipe.CommandStreamerStallEnable &&
+ !pipe.RenderTargetCacheFlushEnable &&
+ !pipe.DepthCacheFlushEnable &&
+ !pipe.StallAtPixelScoreboard &&
+ !pipe.PostSyncOperation &&
+ !pipe.DepthStallEnable &&
+ !pipe.DCFlushEnable)
+ pipe.StallAtPixelScoreboard = true;
+ anv_debug_dump_pc(pipe);
+ }
+
+ /* If a render target flush was emitted, then we can toggle off the bit
+ * saying that render target writes are ongoing.
+ */
+ if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
+ bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
+
+ if (GFX_VERx10 == 75) {
+ /* Haswell needs additional work-arounds:
+ *
+ * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
+ *
+ * Option 1:
+ * PIPE_CONTROL command with the CS Stall and the required write
+ * caches flushed with Post-SyncOperation as Write Immediate Data
+ * followed by eight dummy MI_STORE_DATA_IMM (write to scratch
+ * space) commands.
+ *
+ * Example:
+ * - Workload-1
+ * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
+ * Immediate Data, Required Write Cache Flush bits set)
+ * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
+ * - Workload-2 (Can use the data produce or output by
+ * Workload-1)
+ *
+ * Unfortunately, both the PRMs and the internal docs are a bit
+ * out-of-date in this regard. What the Windows driver does (and
+ * this appears to actually work) is to emit a register read from the
+ * memory address written by the pipe control above.
+ *
+ * What register we load into doesn't matter. We choose an indirect
+ * rendering register because we know it always exists and it's one
+ * of the first registers the command parser allows us to write. If
+ * you don't have command parser support in your kernel (pre-4.2),
+ * this will get turned into MI_NOOP and you won't get the
+ * workaround. Unfortunately, there's just not much we can do in
+ * that case. This register is perfectly safe to write since we
+ * always re-load all of the indirect draw registers right before
+ * 3DPRIMITIVE when needed anyway.
+ */
+ anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
+ lrm.MemoryAddress = device->workaround_address;
+ }
+ }
+
+ bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT);
+ }
+
+ if (bits & ANV_PIPE_INVALIDATE_BITS) {
+ /* From the SKL PRM, Vol. 
2a, "PIPE_CONTROL", + * + * "If the VF Cache Invalidation Enable is set to a 1 in a + * PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to + * 0, with the VF Cache Invalidation Enable set to 0 needs to be sent + * prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to + * a 1." + * + * This appears to hang Broadwell, so we restrict it to just gfx9. + */ + if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) + anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe); + + anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) { + pipe.StateCacheInvalidationEnable = + bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT; + pipe.ConstantCacheInvalidationEnable = + bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT; +#if GFX_VER >= 12 + /* Invalidates the L3 cache part in which index & vertex data is loaded + * when VERTEX_BUFFER_STATE::L3BypassDisable is set. + */ + pipe.L3ReadOnlyCacheInvalidationEnable = + bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT; +#endif + pipe.VFCacheInvalidationEnable = + bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + pipe.TextureCacheInvalidationEnable = + bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; + pipe.InstructionCacheInvalidateEnable = + bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT; + + /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL", + * + * "When VF Cache Invalidate is set “Post Sync Operation” must be + * enabled to “Write Immediate Data” or “Write PS Depth Count” or + * “Write Timestamp”. + */ + if (GFX_VER == 9 && pipe.VFCacheInvalidationEnable) { + pipe.PostSyncOperation = WriteImmediateData; + pipe.Address = device->workaround_address; + } + anv_debug_dump_pc(pipe); + } + +#if GFX_VER == 12 + if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info->has_aux_map) { + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num); + lri.DataDWord = 1; + } + } +#endif + + bits &= ~ANV_PIPE_INVALIDATE_BITS; + } + + return bits; +} + +void +genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) +{ + enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits; + + if (unlikely(cmd_buffer->device->physical->always_flush_cache)) + bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS; + else if (bits == 0) + return; + + bool trace_flush = + (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | ANV_PIPE_INVALIDATE_BITS)) != 0; + if (trace_flush) + trace_intel_begin_stall(&cmd_buffer->trace); + + if ((GFX_VER >= 8 && GFX_VER <= 9) && + (bits & ANV_PIPE_CS_STALL_BIT) && + (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) { + /* If we are doing a VF cache invalidate AND a CS stall (it must be + * both) then we can reset our vertex cache tracking. + */ + memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0, + sizeof(cmd_buffer->state.gfx.vb_dirty_ranges)); + memset(&cmd_buffer->state.gfx.ib_dirty_range, 0, + sizeof(cmd_buffer->state.gfx.ib_dirty_range)); + } + + cmd_buffer->state.pending_pipe_bits = + genX(emit_apply_pipe_flushes)(&cmd_buffer->batch, + cmd_buffer->device, + cmd_buffer->state.current_pipeline, + bits); + + if (trace_flush) { + trace_intel_end_stall(&cmd_buffer->trace, bits, + anv_pipe_flush_bit_to_ds_stall_flag, NULL); + } +} + +static void +cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer, + const VkDependencyInfo *dep_info, + const char *reason) +{ + /* XXX: Right now, we're really dumb and just flush whatever categories + * the app asks for. One of these days we may make this a bit better + * but right now that's all the hardware allows for in most areas. 
+ */ + VkAccessFlags2 src_flags = 0; + VkAccessFlags2 dst_flags = 0; + + for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) { + src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask; + dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask; + } + + for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) { + src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask; + dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask; + } + + for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) { + const VkImageMemoryBarrier2 *img_barrier = + &dep_info->pImageMemoryBarriers[i]; + + src_flags |= img_barrier->srcAccessMask; + dst_flags |= img_barrier->dstAccessMask; + + ANV_FROM_HANDLE(anv_image, image, img_barrier->image); + const VkImageSubresourceRange *range = &img_barrier->subresourceRange; + + uint32_t base_layer, layer_count; + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(image->vk.extent.depth, range->baseMipLevel); + } else { + base_layer = range->baseArrayLayer; + layer_count = vk_image_subresource_layer_count(&image->vk, range); + } + const uint32_t level_count = + vk_image_subresource_level_count(&image->vk, range); + + if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { + transition_depth_buffer(cmd_buffer, image, + base_layer, layer_count, + img_barrier->oldLayout, + img_barrier->newLayout, + false /* will_full_fast_clear */); + } + + if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { + transition_stencil_buffer(cmd_buffer, image, + range->baseMipLevel, level_count, + base_layer, layer_count, + img_barrier->oldLayout, + img_barrier->newLayout, + false /* will_full_fast_clear */); + } + + if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + VkImageAspectFlags color_aspects = + vk_image_expand_aspect_mask(&image->vk, range->aspectMask); + anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) { + transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit, + range->baseMipLevel, level_count, + base_layer, layer_count, + img_barrier->oldLayout, + img_barrier->newLayout, + img_barrier->srcQueueFamilyIndex, + img_barrier->dstQueueFamilyIndex, + false /* will_full_fast_clear */); + } + } + } + + enum anv_pipe_bits bits = + anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) | + anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags); + + anv_add_pending_pipe_bits(cmd_buffer, bits, reason); +} + +void genX(CmdPipelineBarrier2)( + VkCommandBuffer commandBuffer, + const VkDependencyInfo* pDependencyInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier"); +} + +static void +cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer) +{ + VkShaderStageFlags stages = + cmd_buffer->state.gfx.pipeline->active_stages; + + /* In order to avoid thrash, we assume that vertex and fragment stages + * always exist. In the rare case where one is missing *and* the other + * uses push concstants, this may be suboptimal. However, avoiding stalls + * seems more important. 
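cmd_buffer_barrier() reduces the whole VkDependencyInfo to a single pair of access masks before converting them to pipe bits. The folding step on its own looks like the sketch below, which assumes only the Vulkan 1.3 core headers and is not tied to any anv types:

#include <stdint.h>
#include <vulkan/vulkan_core.h>

/* Fold every barrier in the dependency info into one (src, dst) pair. */
static void
fold_dependency_access(const VkDependencyInfo *dep,
                       VkAccessFlags2 *src, VkAccessFlags2 *dst)
{
   *src = 0;
   *dst = 0;
   for (uint32_t i = 0; i < dep->memoryBarrierCount; i++) {
      *src |= dep->pMemoryBarriers[i].srcAccessMask;
      *dst |= dep->pMemoryBarriers[i].dstAccessMask;
   }
   for (uint32_t i = 0; i < dep->bufferMemoryBarrierCount; i++) {
      *src |= dep->pBufferMemoryBarriers[i].srcAccessMask;
      *dst |= dep->pBufferMemoryBarriers[i].dstAccessMask;
   }
   for (uint32_t i = 0; i < dep->imageMemoryBarrierCount; i++) {
      *src |= dep->pImageMemoryBarriers[i].srcAccessMask;
      *dst |= dep->pImageMemoryBarriers[i].dstAccessMask;
   }
}

int main(void)
{
   VkMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
      .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
   };
   VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };
   VkAccessFlags2 src, dst;
   fold_dependency_access(&dep, &src, &dst);
   return (src != 0 && dst != 0) ? 0 : 1;
}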
+ */ + stages |= VK_SHADER_STAGE_FRAGMENT_BIT; + if (anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline)) + stages |= VK_SHADER_STAGE_VERTEX_BIT; + + if (stages == cmd_buffer->state.gfx.push_constant_stages) + return; + + const unsigned push_constant_kb = + cmd_buffer->device->info->max_constant_urb_size_kb; + + const unsigned num_stages = + util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS); + unsigned size_per_stage = push_constant_kb / num_stages; + + /* Broadwell+ and Haswell gt3 require that the push constant sizes be in + * units of 2KB. Incidentally, these are the same platforms that have + * 32KB worth of push constant space. + */ + if (push_constant_kb == 32) + size_per_stage &= ~1u; + + uint32_t kb_used = 0; + for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) { + unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0; + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) { + alloc._3DCommandSubOpcode = 18 + i; + alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0; + alloc.ConstantBufferSize = push_size; + } + kb_used += push_size; + } + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) { + alloc.ConstantBufferOffset = kb_used; + alloc.ConstantBufferSize = push_constant_kb - kb_used; + } + +#if GFX_VERx10 == 125 + /* Wa_22011440098 + * + * In 3D mode, after programming push constant alloc command immediately + * program push constant command(ZERO length) without any commit between + * them. + */ + if (intel_device_info_is_dg2(cmd_buffer->device->info)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) { + c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0); + } + } +#endif + + cmd_buffer->state.gfx.push_constant_stages = stages; + + /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS: + * + * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to + * the next 3DPRIMITIVE command after programming the + * 3DSTATE_PUSH_CONSTANT_ALLOC_VS" + * + * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of + * pipeline setup, we need to dirty push constants. + */ + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; +} + +static VkResult +emit_binding_table(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipe_state, + struct anv_shader_bin *shader, + struct anv_state *bt_state) +{ + uint32_t state_offset; + + struct anv_pipeline_bind_map *map = &shader->bind_map; + if (map->surface_count == 0) { + *bt_state = (struct anv_state) { 0, }; + return VK_SUCCESS; + } + + *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, + map->surface_count, + &state_offset); + uint32_t *bt_map = bt_state->map; + + if (bt_state->map == NULL) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + /* We only need to emit relocs if we're not using softpin. If we are using + * softpin then we always keep all user-allocated memory objects resident. 
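The URB push constant partitioning above can be checked with a small standalone model: divide the available space across the active graphics stages, round down to 2kB granularity on 32kB parts, and give the fragment stage whatever is left. The stage table and sizes below are made-up example values, not driver data:

#include <stdio.h>

int main(void)
{
   const unsigned push_constant_kb = 32;             /* e.g. BDW / HSW GT3 */
   const unsigned active_stages[5] = {1, 0, 0, 0, 1}; /* VS, HS, DS, GS, PS */

   unsigned num_stages = 0;
   for (unsigned i = 0; i < 5; i++)
      num_stages += active_stages[i];

   unsigned size_per_stage = push_constant_kb / num_stages;
   if (push_constant_kb == 32)
      size_per_stage &= ~1u;   /* 2kB granularity on 32kB platforms */

   unsigned kb_used = 0;
   for (unsigned i = 0; i < 4; i++) {                /* VS..GS */
      unsigned sz = active_stages[i] ? size_per_stage : 0;
      printf("stage %u: offset %ukB size %ukB\n", i, sz ? kb_used : 0, sz);
      kb_used += sz;
   }
   /* PS gets the remainder, mirroring 3DSTATE_PUSH_CONSTANT_ALLOC_PS */
   printf("PS: offset %ukB size %ukB\n", kb_used, push_constant_kb - kb_used);
   return 0;
}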
+ */ + const bool need_client_mem_relocs = + anv_use_relocations(cmd_buffer->device->physical); + struct anv_push_constants *push = &pipe_state->push_constants; + + for (uint32_t s = 0; s < map->surface_count; s++) { + struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s]; + + struct anv_state surface_state; + + switch (binding->set) { + case ANV_DESCRIPTOR_SET_NULL: + bt_map[s] = 0; + break; + + case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS: + /* Color attachment binding */ + assert(shader->stage == MESA_SHADER_FRAGMENT); + if (binding->index < cmd_buffer->state.gfx.color_att_count) { + const struct anv_attachment *att = + &cmd_buffer->state.gfx.color_att[binding->index]; + surface_state = att->surface_state.state; + } else { + surface_state = cmd_buffer->state.gfx.null_surface_state; + } + assert(surface_state.map); + bt_map[s] = surface_state.offset + state_offset; + break; + + case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: { + struct anv_state surface_state = + anv_cmd_buffer_alloc_surface_state(cmd_buffer); + + struct anv_address constant_data = { + .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo, + .offset = shader->kernel.offset + + shader->prog_data->const_data_offset, + }; + unsigned constant_data_size = shader->prog_data->const_data_size; + + const enum isl_format format = + anv_isl_format_for_descriptor_type(cmd_buffer->device, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, ISL_SWIZZLE_IDENTITY, + ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, + constant_data, constant_data_size, 1); + + assert(surface_state.map); + bt_map[s] = surface_state.offset + state_offset; + add_surface_reloc(cmd_buffer, surface_state, constant_data); + break; + } + + case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: { + /* This is always the first binding for compute shaders */ + assert(shader->stage == MESA_SHADER_COMPUTE && s == 0); + + struct anv_state surface_state = + anv_cmd_buffer_alloc_surface_state(cmd_buffer); + + const enum isl_format format = + anv_isl_format_for_descriptor_type(cmd_buffer->device, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, ISL_SWIZZLE_IDENTITY, + ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, + cmd_buffer->state.compute.num_workgroups, + 12, 1); + + assert(surface_state.map); + bt_map[s] = surface_state.offset + state_offset; + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + cmd_buffer->state.compute.num_workgroups); + } + break; + } + + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { + /* This is a descriptor set buffer so the set index is actually + * given by binding->binding. (Yes, that's confusing.) + */ + struct anv_descriptor_set *set = + pipe_state->descriptors[binding->index]; + assert(set->desc_mem.alloc_size); + assert(set->desc_surface_state.alloc_size); + bt_map[s] = set->desc_surface_state.offset + state_offset; + add_surface_reloc(cmd_buffer, set->desc_surface_state, + anv_descriptor_set_address(set)); + break; + } + + default: { + assert(binding->set < MAX_SETS); + const struct anv_descriptor_set *set = + pipe_state->descriptors[binding->set]; + if (binding->index >= set->descriptor_count) { + /* From the Vulkan spec section entitled "DescriptorSet and + * Binding Assignment": + * + * "If the array is runtime-sized, then array elements greater + * than or equal to the size of that binding in the bound + * descriptor set must not be used." 
+ * + * Unfortunately, the compiler isn't smart enough to figure out + * when a dynamic binding isn't used so it may grab the whole + * array and stick it in the binding table. In this case, it's + * safe to just skip those bindings that are OOB. + */ + assert(binding->index < set->layout->descriptor_count); + continue; + } + const struct anv_descriptor *desc = &set->descriptors[binding->index]; + + switch (desc->type) { + case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: + case VK_DESCRIPTOR_TYPE_SAMPLER: + /* Nothing for us to do here */ + continue; + + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: { + if (desc->image_view) { + struct anv_surface_state sstate = + (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? + desc->image_view->planes[binding->plane].general_sampler_surface_state : + desc->image_view->planes[binding->plane].optimal_sampler_surface_state; + surface_state = sstate.state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + } else { + surface_state = cmd_buffer->device->null_surface_state; + } + break; + } + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { + if (desc->image_view) { + struct anv_surface_state sstate = + binding->lowered_storage_surface + ? desc->image_view->planes[binding->plane].lowered_storage_surface_state + : desc->image_view->planes[binding->plane].storage_surface_state; + surface_state = sstate.state; + assert(surface_state.alloc_size); + if (surface_state.offset == 0) { + mesa_loge("Bound a image to a descriptor where the " + "descriptor does not have NonReadable " + "set and the image does not have a " + "corresponding SPIR-V format enum."); + vk_debug_report(&cmd_buffer->device->physical->instance->vk, + VK_DEBUG_REPORT_ERROR_BIT_EXT, + &desc->image_view->vk.base, + __LINE__, 0, "anv", + "Bound a image to a descriptor where the " + "descriptor does not have NonReadable " + "set and the image does not have a " + "corresponding SPIR-V format enum."); + } + if (surface_state.offset && need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + } else { + surface_state = cmd_buffer->device->null_surface_state; + } + break; + } + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + if (desc->set_buffer_view) { + surface_state = desc->set_buffer_view->surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->set_buffer_view->address); + } + } else { + surface_state = cmd_buffer->device->null_surface_state; + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + if (desc->buffer_view) { + surface_state = desc->buffer_view->surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } + } else { + surface_state = cmd_buffer->device->null_surface_state; + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + if (desc->buffer) { + /* Compute the offset within the buffer */ + uint32_t dynamic_offset = + push->dynamic_offsets[binding->dynamic_offset_index]; + uint64_t offset = desc->offset + dynamic_offset; + /* Clamp to the buffer size */ + offset = MIN2(offset, desc->buffer->vk.size); + /* Clamp the range to the buffer size */ + uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset); + + /* Align the range for 
consistency */ + if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) + range = align_u32(range, ANV_UBO_ALIGNMENT); + + struct anv_address address = + anv_address_add(desc->buffer->address, offset); + + surface_state = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); + enum isl_format format = + anv_isl_format_for_descriptor_type(cmd_buffer->device, + desc->type); + + isl_surf_usage_flags_t usage = + desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ? + ISL_SURF_USAGE_CONSTANT_BUFFER_BIT : + ISL_SURF_USAGE_STORAGE_BIT; + + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, ISL_SWIZZLE_IDENTITY, + usage, address, range, 1); + if (need_client_mem_relocs) + add_surface_reloc(cmd_buffer, surface_state, address); + } else { + surface_state = cmd_buffer->device->null_surface_state; + } + break; + } + + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + if (desc->buffer_view) { + surface_state = binding->lowered_storage_surface + ? desc->buffer_view->lowered_storage_surface_state + : desc->buffer_view->storage_surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } + } else { + surface_state = cmd_buffer->device->null_surface_state; + } + break; + + default: + assert(!"Invalid descriptor type"); + continue; + } + assert(surface_state.map); + bt_map[s] = surface_state.offset + state_offset; + break; + } + } + } + + return VK_SUCCESS; +} + +static VkResult +emit_samplers(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipe_state, + struct anv_shader_bin *shader, + struct anv_state *state) +{ + struct anv_pipeline_bind_map *map = &shader->bind_map; + if (map->sampler_count == 0) { + *state = (struct anv_state) { 0, }; + return VK_SUCCESS; + } + + uint32_t size = map->sampler_count * 16; + *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32); + + if (state->map == NULL) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + for (uint32_t s = 0; s < map->sampler_count; s++) { + struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s]; + const struct anv_descriptor *desc = + &pipe_state->descriptors[binding->set]->descriptors[binding->index]; + + if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER && + desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + continue; + + struct anv_sampler *sampler = desc->sampler; + + /* This can happen if we have an unfilled slot since TYPE_SAMPLER + * happens to be zero. 
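For the dynamic-offset descriptors handled a little earlier, the offset/range math reduces to clamping against the buffer size and then aligning UBO ranges. A standalone sketch with invented numbers (UBO_ALIGNMENT here stands in for ANV_UBO_ALIGNMENT):

#include <stdint.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define UBO_ALIGNMENT 64u        /* stand-in for ANV_UBO_ALIGNMENT */

static uint32_t
align_u32(uint32_t v, uint32_t a)
{
   return (v + a - 1) & ~(a - 1);
}

int main(void)
{
   /* invented example values */
   uint64_t buffer_size    = 1000;  /* VkBuffer::size */
   uint64_t desc_offset    = 256;   /* offset recorded in the descriptor */
   uint32_t dynamic_offset = 512;   /* from vkCmdBindDescriptorSets */
   uint32_t desc_range     = 512;

   /* Clamp the start to the buffer, then clamp the range to what is left. */
   uint64_t offset = MIN2(desc_offset + dynamic_offset, buffer_size);
   uint32_t range  = MIN2(desc_range, (uint32_t)(buffer_size - offset));

   /* UBO ranges are aligned up afterwards, as the driver code does. */
   range = align_u32(range, UBO_ALIGNMENT);

   printf("offset=%llu range=%u\n", (unsigned long long)offset, range);
   return 0;
}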
+ */ + if (sampler == NULL) + continue; + + memcpy(state->map + (s * 16), + sampler->state[binding->plane], sizeof(sampler->state[0])); + } + + return VK_SUCCESS; +} + +static uint32_t +flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipe_state, + const VkShaderStageFlags dirty, + struct anv_shader_bin **shaders, + uint32_t num_shaders) +{ + VkShaderStageFlags flushed = 0; + + VkResult result = VK_SUCCESS; + for (uint32_t i = 0; i < num_shaders; i++) { + if (!shaders[i]) + continue; + + gl_shader_stage stage = shaders[i]->stage; + VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage); + if ((vk_stage & dirty) == 0) + continue; + + assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers)); + result = emit_samplers(cmd_buffer, pipe_state, shaders[i], + &cmd_buffer->state.samplers[stage]); + if (result != VK_SUCCESS) + break; + + assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables)); + result = emit_binding_table(cmd_buffer, pipe_state, shaders[i], + &cmd_buffer->state.binding_tables[stage]); + if (result != VK_SUCCESS) + break; + + flushed |= vk_stage; + } + + if (result != VK_SUCCESS) { + assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY); + + result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); + if (result != VK_SUCCESS) + return 0; + + /* Re-emit state base addresses so we get the new surface state base + * address before we start emitting binding tables etc. + */ + genX(cmd_buffer_emit_state_base_address)(cmd_buffer); + + /* Re-emit all active binding tables */ + flushed = 0; + + for (uint32_t i = 0; i < num_shaders; i++) { + if (!shaders[i]) + continue; + + gl_shader_stage stage = shaders[i]->stage; + + result = emit_samplers(cmd_buffer, pipe_state, shaders[i], + &cmd_buffer->state.samplers[stage]); + if (result != VK_SUCCESS) { + anv_batch_set_error(&cmd_buffer->batch, result); + return 0; + } + result = emit_binding_table(cmd_buffer, pipe_state, shaders[i], + &cmd_buffer->state.binding_tables[stage]); + if (result != VK_SUCCESS) { + anv_batch_set_error(&cmd_buffer->batch, result); + return 0; + } + + flushed |= mesa_to_vk_shader_stage(stage); + } + } + + return flushed; +} + +static void +cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer, + uint32_t stages) +{ + static const uint32_t sampler_state_opcodes[] = { + [MESA_SHADER_VERTEX] = 43, + [MESA_SHADER_TESS_CTRL] = 44, /* HS */ + [MESA_SHADER_TESS_EVAL] = 45, /* DS */ + [MESA_SHADER_GEOMETRY] = 46, + [MESA_SHADER_FRAGMENT] = 47, + }; + + static const uint32_t binding_table_opcodes[] = { + [MESA_SHADER_VERTEX] = 38, + [MESA_SHADER_TESS_CTRL] = 39, + [MESA_SHADER_TESS_EVAL] = 40, + [MESA_SHADER_GEOMETRY] = 41, + [MESA_SHADER_FRAGMENT] = 42, + }; + + anv_foreach_stage(s, stages) { + assert(s < ARRAY_SIZE(binding_table_opcodes)); + + if (cmd_buffer->state.samplers[s].alloc_size > 0) { + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) { + ssp._3DCommandSubOpcode = sampler_state_opcodes[s]; + ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset; + } + } + + /* Always emit binding table pointers if we're asked to, since on SKL + * this is what flushes push constants. 
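flush_descriptor_sets() relies on a try-once, rebase, retry-once pattern: if any table allocation fails, it grabs a new binding table block, re-emits STATE_BASE_ADDRESS, and redoes every stage; a second failure is treated as fatal. A toy model of that control flow (hypothetical helper, not driver code):

#include <stdbool.h>
#include <stdio.h>

/* pretend the first attempt runs out of binding table space */
static bool
emit_tables(int attempt)
{
   return attempt > 0;
}

int main(void)
{
   if (!emit_tables(0)) {
      /* grab a fresh binding table block, re-emit the state base address,
       * then redo every stage from scratch */
      printf("allocate new block, re-emit state base address\n");
      if (!emit_tables(1))
         printf("second failure is fatal: set batch error\n");
   }
   return 0;
}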
*/ + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) { + btp._3DCommandSubOpcode = binding_table_opcodes[s]; + btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset; + } + } +} + +static struct anv_address +get_push_range_address(struct anv_cmd_buffer *cmd_buffer, + const struct anv_shader_bin *shader, + const struct anv_push_range *range) +{ + struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + switch (range->set) { + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { + /* This is a descriptor set buffer so the set index is + * actually given by binding->binding. (Yes, that's + * confusing.) + */ + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->index]; + return anv_descriptor_set_address(set); + } + + case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: { + if (gfx_state->base.push_constants_state.alloc_size == 0) { + gfx_state->base.push_constants_state = + anv_cmd_buffer_gfx_push_constants(cmd_buffer); + } + return (struct anv_address) { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = gfx_state->base.push_constants_state.offset, + }; + } + + case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: + return (struct anv_address) { + .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo, + .offset = shader->kernel.offset + + shader->prog_data->const_data_offset, + }; + + default: { + assert(range->set < MAX_SETS); + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->set]; + const struct anv_descriptor *desc = + &set->descriptors[range->index]; + + if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { + if (desc->buffer_view) + return desc->buffer_view->address; + } else { + assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); + if (desc->buffer) { + const struct anv_push_constants *push = + &gfx_state->base.push_constants; + uint32_t dynamic_offset = + push->dynamic_offsets[range->dynamic_offset_index]; + return anv_address_add(desc->buffer->address, + desc->offset + dynamic_offset); + } + } + + /* For NULL UBOs, we just return an address in the workaround BO. We do + * writes to it for workarounds but always at the bottom. The higher + * bytes should be all zeros. + */ + assert(range->length * 32 <= 2048); + return (struct anv_address) { + .bo = cmd_buffer->device->workaround_bo, + .offset = 1024, + }; + } + } +} + + +/** Returns the size in bytes of the bound buffer + * + * The range is relative to the start of the buffer, not the start of the + * range. 
The returned range may be smaller than + * + * (range->start + range->length) * 32; + */ +static uint32_t +get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer, + const struct anv_shader_bin *shader, + const struct anv_push_range *range) +{ + assert(shader->stage != MESA_SHADER_COMPUTE); + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + switch (range->set) { + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->index]; + assert(range->start * 32 < set->desc_mem.alloc_size); + assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size); + return set->desc_mem.alloc_size; + } + + case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: + return (range->start + range->length) * 32; + + case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: + return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT); + + default: { + assert(range->set < MAX_SETS); + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->set]; + const struct anv_descriptor *desc = + &set->descriptors[range->index]; + + if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { + /* Here we promote a UBO to a binding table entry so that we can avoid a layer of indirection. + * We use the descriptor set's internally allocated surface state to fill the binding table entry. + */ + if (!desc->set_buffer_view) + return 0; + + if (range->start * 32 > desc->set_buffer_view->range) + return 0; + + return desc->set_buffer_view->range; + } else { + if (!desc->buffer) + return 0; + + assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); + /* Compute the offset within the buffer */ + const struct anv_push_constants *push = + &gfx_state->base.push_constants; + uint32_t dynamic_offset = + push->dynamic_offsets[range->dynamic_offset_index]; + uint64_t offset = desc->offset + dynamic_offset; + /* Clamp to the buffer size */ + offset = MIN2(offset, desc->buffer->vk.size); + /* Clamp the range to the buffer size */ + uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset); + + /* Align the range for consistency */ + bound_range = align_u32(bound_range, ANV_UBO_ALIGNMENT); + + return bound_range; + } + } + } +} + +static void +cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer, + gl_shader_stage stage, + struct anv_address *buffers, + unsigned buffer_count) +{ + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline; + + static const uint32_t push_constant_opcodes[] = { + [MESA_SHADER_VERTEX] = 21, + [MESA_SHADER_TESS_CTRL] = 25, /* HS */ + [MESA_SHADER_TESS_EVAL] = 26, /* DS */ + [MESA_SHADER_GEOMETRY] = 22, + [MESA_SHADER_FRAGMENT] = 23, + }; + + assert(stage < ARRAY_SIZE(push_constant_opcodes)); + + UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) { + c._3DCommandSubOpcode = push_constant_opcodes[stage]; + + /* Set MOCS, except on Gfx8, because the Broadwell PRM says: + * + * "Constant Buffer Object Control State must be always + * programmed to zero." + * + * This restriction does not exist on any newer platforms. + * + * We only have one MOCS field for the whole packet, not one per + * buffer. We could go out of our way here to walk over all of + * the buffers and see if any of them are used externally and use + * the external MOCS. However, the notion that someone would use + * the same bit of memory for both scanout and a UBO is nuts. 
+ * + * Let's not bother and assume it's all internal. + */ +#if GFX_VER >= 9 + c.MOCS = mocs; +#elif GFX_VER < 8 + c.ConstantBody.MOCS = mocs; +#endif + + if (anv_pipeline_has_stage(pipeline, stage)) { + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; + +#if GFX_VERx10 >= 75 + /* The Skylake PRM contains the following restriction: + * + * "The driver must ensure The following case does not occur + * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with + * buffer 3 read length equal to zero committed followed by a + * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to + * zero committed." + * + * To avoid this, we program the buffers in the highest slots. + * This way, slot 0 is only used if slot 3 is also used. + */ + assert(buffer_count <= 4); + const unsigned shift = 4 - buffer_count; + for (unsigned i = 0; i < buffer_count; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + + /* At this point we only have non-empty ranges */ + assert(range->length > 0); + + /* For Ivy Bridge, make sure we only set the first range (actual + * push constants) + */ + assert((GFX_VERx10 >= 75) || i == 0); + + c.ConstantBody.ReadLength[i + shift] = range->length; + c.ConstantBody.Buffer[i + shift] = + anv_address_add(buffers[i], range->start * 32); + } +#else + /* For Ivy Bridge, push constants are relative to dynamic state + * base address and we only ever push actual push constants. + */ + if (bind_map->push_ranges[0].length > 0) { + assert(buffer_count == 1); + assert(bind_map->push_ranges[0].set == + ANV_DESCRIPTOR_SET_PUSH_CONSTANTS); + assert(buffers[0].bo == + cmd_buffer->device->dynamic_state_pool.block_pool.bo); + c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length; + c.ConstantBody.Buffer[0].bo = NULL; + c.ConstantBody.Buffer[0].offset = buffers[0].offset; + } + assert(bind_map->push_ranges[1].length == 0); + assert(bind_map->push_ranges[2].length == 0); + assert(bind_map->push_ranges[3].length == 0); +#endif + } + } +} + +#if GFX_VER >= 12 +static void +cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer, + uint32_t shader_mask, + struct anv_address *buffers, + uint32_t buffer_count) +{ + if (buffer_count == 0) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) { + c.ShaderUpdateEnable = shader_mask; + c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false); + } + return; + } + + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline; + + static const UNUSED uint32_t push_constant_opcodes[] = { + [MESA_SHADER_VERTEX] = 21, + [MESA_SHADER_TESS_CTRL] = 25, /* HS */ + [MESA_SHADER_TESS_EVAL] = 26, /* DS */ + [MESA_SHADER_GEOMETRY] = 22, + [MESA_SHADER_FRAGMENT] = 23, + }; + + gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask); + assert(stage < ARRAY_SIZE(push_constant_opcodes)); + + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; + + uint32_t *dw; + const uint32_t buffer_mask = (1 << buffer_count) - 1; + const uint32_t num_dwords = 2 + 2 * buffer_count; + + dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords, + GENX(3DSTATE_CONSTANT_ALL), + .ShaderUpdateEnable = shader_mask, + .PointerBufferMask = buffer_mask, + .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false)); + + for (int i = 0; i < buffer_count; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + GENX(3DSTATE_CONSTANT_ALL_DATA_pack)( + &cmd_buffer->batch, dw + 2 
+ i * 2, + &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) { + .PointerToConstantBuffer = + anv_address_add(buffers[i], range->start * 32), + .ConstantBufferReadLength = range->length, + }); + } +} +#endif + +static void +cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, + VkShaderStageFlags dirty_stages) +{ + VkShaderStageFlags flushed = 0; + struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline; + +#if GFX_VER >= 12 + uint32_t nobuffer_stages = 0; +#endif + + /* Compute robust pushed register access mask for each stage. */ + if (cmd_buffer->device->robust_buffer_access) { + anv_foreach_stage(stage, dirty_stages) { + if (!anv_pipeline_has_stage(pipeline, stage)) + continue; + + const struct anv_shader_bin *shader = pipeline->shaders[stage]; + const struct anv_pipeline_bind_map *bind_map = &shader->bind_map; + struct anv_push_constants *push = &gfx_state->base.push_constants; + + push->push_reg_mask[stage] = 0; + /* Start of the current range in the shader, relative to the start of + * push constants in the shader. + */ + unsigned range_start_reg = 0; + for (unsigned i = 0; i < 4; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + if (range->length == 0) + continue; + + unsigned bound_size = + get_push_range_bound_size(cmd_buffer, shader, range); + if (bound_size >= range->start * 32) { + unsigned bound_regs = + MIN2(DIV_ROUND_UP(bound_size, 32) - range->start, + range->length); + assert(range_start_reg + bound_regs <= 64); + push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg, + bound_regs); + } + + cmd_buffer->state.push_constants_dirty |= + mesa_to_vk_shader_stage(stage); + + range_start_reg += range->length; + } + } + } + + /* Resets the push constant state so that we allocate a new one if + * needed. + */ + gfx_state->base.push_constants_state = ANV_STATE_NULL; + + anv_foreach_stage(stage, dirty_stages) { + unsigned buffer_count = 0; + flushed |= mesa_to_vk_shader_stage(stage); + UNUSED uint32_t max_push_range = 0; + + struct anv_address buffers[4] = {}; + if (anv_pipeline_has_stage(pipeline, stage)) { + const struct anv_shader_bin *shader = pipeline->shaders[stage]; + const struct anv_pipeline_bind_map *bind_map = &shader->bind_map; + + /* We have to gather buffer addresses as a second step because the + * loop above puts data into the push constant area and the call to + * get_push_range_address is what locks our push constants and copies + * them into the actual GPU buffer. If we did the two loops at the + * same time, we'd risk only having some of the sizes in the push + * constant buffer when we did the copy. + */ + for (unsigned i = 0; i < 4; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + if (range->length == 0) + break; + + buffers[i] = get_push_range_address(cmd_buffer, shader, range); + max_push_range = MAX2(max_push_range, range->length); + buffer_count++; + } + + /* We have at most 4 buffers but they should be tightly packed */ + for (unsigned i = buffer_count; i < 4; i++) + assert(bind_map->push_ranges[i].length == 0); + } + +#if GFX_VER >= 12 + /* If this stage doesn't have any push constants, emit it later in a + * single CONSTANT_ALL packet. + */ + if (buffer_count == 0) { + nobuffer_stages |= 1 << stage; + continue; + } + + /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL + * contains only 5 bits, so we can only use it for buffers smaller than + * 32. 
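The robust-access mask computed above marks which 32-byte push registers are actually backed by a bound buffer. The same arithmetic in isolation, with invented range and size values:

#include <stdint.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define BITFIELD64_RANGE(start, count) \
   (((count) == 64 ? ~0ull : (1ull << (count)) - 1) << (start))

int main(void)
{
   /* One push range: starts at register 2 of its surface, 8 registers long,
    * placed at register 0 of the push constant space. Only 200 bytes of the
    * surface are bound. All numbers are hypothetical.
    */
   uint32_t range_start = 2, range_length = 8, range_start_reg = 0;
   uint32_t bound_size = 200;

   uint64_t mask = 0;
   if (bound_size >= range_start * 32) {
      uint32_t bound_regs =
         MIN2(DIV_ROUND_UP(bound_size, 32) - range_start, range_length);
      mask |= BITFIELD64_RANGE(range_start_reg, bound_regs);
   }
   printf("push_reg_mask = 0x%llx\n", (unsigned long long)mask);
   return 0;
}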
+ */ + if (max_push_range < 32) { + cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage, + buffers, buffer_count); + continue; + } +#endif + + cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count); + } + +#if GFX_VER >= 12 + if (nobuffer_stages) + cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0); +#endif + + cmd_buffer->state.push_constants_dirty &= ~flushed; +} + +#if GFX_VERx10 >= 125 +static void +cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer, + VkShaderStageFlags dirty_stages) +{ + struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline; + + if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_NV && + anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) { + + const struct anv_shader_bin *shader = pipeline->shaders[MESA_SHADER_TASK]; + const struct anv_pipeline_bind_map *bind_map = &shader->bind_map; + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) { + const struct anv_push_range *range = &bind_map->push_ranges[0]; + if (range->length > 0) { + struct anv_address buffer = + get_push_range_address(cmd_buffer, shader, range); + + uint64_t addr = anv_address_physical(buffer); + data.InlineData[0] = addr & 0xffffffff; + data.InlineData[1] = addr >> 32; + + memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW], + cmd_buffer->state.gfx.base.push_constants.client_data, + BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4); + } + } + } + + if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_NV && + anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) { + + const struct anv_shader_bin *shader = pipeline->shaders[MESA_SHADER_MESH]; + const struct anv_pipeline_bind_map *bind_map = &shader->bind_map; + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) { + const struct anv_push_range *range = &bind_map->push_ranges[0]; + if (range->length > 0) { + struct anv_address buffer = + get_push_range_address(cmd_buffer, shader, range); + + uint64_t addr = anv_address_physical(buffer); + data.InlineData[0] = addr & 0xffffffff; + data.InlineData[1] = addr >> 32; + + memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW], + cmd_buffer->state.gfx.base.push_constants.client_data, + BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4); + } + } + } + + cmd_buffer->state.push_constants_dirty &= ~dirty_stages; +} +#endif + +static void +cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer) +{ + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + + if (!(cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) && + !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) && +#if GFX_VER <= 7 + !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) && + !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) && +#endif + !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT)) + return; + + /* Take dynamic primitive topology in to account with + * 3DSTATE_CLIP::ViewportXYClipTestEnable + */ + VkPolygonMode dynamic_raster_mode = + genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline, + dyn->ia.primitive_topology); + bool xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL); + + struct GENX(3DSTATE_CLIP) clip = { + GENX(3DSTATE_CLIP_header), +#if GFX_VER <= 7 + .FrontWinding = genX(vk_to_intel_front_face)[dyn->rs.front_face], + .CullMode = genX(vk_to_intel_cullmode)[dyn->rs.cull_mode], +#endif + .ViewportXYClipTestEnable = xy_clip_test_enable, + }; + uint32_t 
dwords[GENX(3DSTATE_CLIP_length)]; + + /* TODO(mesh): Multiview. */ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + if (anv_pipeline_is_primitive(pipeline)) { + const struct brw_vue_prog_data *last = + anv_pipeline_get_last_vue_prog_data(pipeline); + if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) { + clip.MaximumVPIndex = dyn->vp.viewport_count > 0 ? + dyn->vp.viewport_count - 1 : 0; + } + } else if (anv_pipeline_is_mesh(pipeline)) { + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + if (mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) { + clip.MaximumVPIndex = dyn->vp.viewport_count > 0 ? + dyn->vp.viewport_count - 1 : 0; + } + } + + GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip); + anv_batch_emit_merge(&cmd_buffer->batch, dwords, + pipeline->gfx7.clip); +} + +static void +cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint32_t count = dyn->vp.viewport_count; + const VkViewport *viewports = dyn->vp.viewports; + struct anv_state sf_clip_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64); + + bool negative_one_to_one = + cmd_buffer->state.gfx.pipeline->negative_one_to_one; + + float scale = negative_one_to_one ? 0.5f : 1.0f; + + for (uint32_t i = 0; i < count; i++) { + const VkViewport *vp = &viewports[i]; + + /* The gfx7 state struct has just the matrix and guardband fields, the + * gfx8 struct adds the min/max viewport fields. */ + struct GENX(SF_CLIP_VIEWPORT) sfv = { + .ViewportMatrixElementm00 = vp->width / 2, + .ViewportMatrixElementm11 = vp->height / 2, + .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale, + .ViewportMatrixElementm30 = vp->x + vp->width / 2, + .ViewportMatrixElementm31 = vp->y + vp->height / 2, + .ViewportMatrixElementm32 = negative_one_to_one ? + (vp->minDepth + vp->maxDepth) * scale : vp->minDepth, + .XMinClipGuardband = -1.0f, + .XMaxClipGuardband = 1.0f, + .YMinClipGuardband = -1.0f, + .YMaxClipGuardband = 1.0f, +#if GFX_VER >= 8 + .XMinViewPort = vp->x, + .XMaxViewPort = vp->x + vp->width - 1, + .YMinViewPort = MIN2(vp->y, vp->y + vp->height), + .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1, +#endif + }; + + const uint32_t fb_size_max = 1 << 14; + uint32_t x_min = 0, x_max = fb_size_max; + uint32_t y_min = 0, y_max = fb_size_max; + + /* If we have a valid renderArea, include that */ + if (gfx->render_area.extent.width > 0 && + gfx->render_area.extent.height > 0) { + x_min = MAX2(x_min, gfx->render_area.offset.x); + x_max = MIN2(x_max, gfx->render_area.offset.x + + gfx->render_area.extent.width); + y_min = MAX2(y_min, gfx->render_area.offset.y); + y_max = MIN2(y_max, gfx->render_area.offset.y + + gfx->render_area.extent.height); + } + + /* The client is required to have enough scissors for whatever it sets + * as ViewportIndex but it's possible that they've got more viewports + * set from a previous command. Also, from the Vulkan 1.3.207: + * + * "The application must ensure (using scissor if necessary) that + * all rendering is contained within the render area." + * + * If the client doesn't set a scissor, that basically means it + * guarantees everything is in-bounds already. If we end up using a + * guardband of [-1, 1] in that case, there shouldn't be much loss.
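The SF_CLIP_VIEWPORT matrix elements are just the usual viewport scale/translate terms, with the z terms rescaled when VK_EXT_depth_clip_control selects a [-1, 1] clip space. A standalone recomputation of that math with an example viewport (illustration only):

#include <stdio.h>

int main(void)
{
   /* hypothetical viewport */
   float x = 0.0f, y = 0.0f, width = 1920.0f, height = 1080.0f;
   float min_depth = 0.0f, max_depth = 1.0f;
   int negative_one_to_one = 0;          /* depth clip control disabled */
   float scale = negative_one_to_one ? 0.5f : 1.0f;

   float m00 = width / 2;                /* x scale */
   float m11 = height / 2;               /* y scale */
   float m22 = (max_depth - min_depth) * scale;  /* z scale */
   float m30 = x + width / 2;            /* x translate */
   float m31 = y + height / 2;           /* y translate */
   float m32 = negative_one_to_one ?
      (min_depth + max_depth) * scale : min_depth; /* z translate */

   printf("m00=%.1f m11=%.1f m22=%.1f m30=%.1f m31=%.1f m32=%.1f\n",
          m00, m11, m22, m30, m31, m32);
   return 0;
}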
+ * It's theoretically possible that they could do all their clipping + * with clip planes but that'd be a bit odd. + */ + if (i < dyn->vp.scissor_count) { + const VkRect2D *scissor = &dyn->vp.scissors[i]; + x_min = MAX2(x_min, scissor->offset.x); + x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width); + y_min = MAX2(y_min, scissor->offset.y); + y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height); + } + + /* Only bother calculating the guardband if our known render area is + * less than the maximum size. Otherwise, it will calculate [-1, 1] + * anyway but possibly with precision loss. + */ + if (x_min > 0 || x_max < fb_size_max || + y_min > 0 || y_max < fb_size_max) { + intel_calculate_guardband_size(x_min, x_max, y_min, y_max, + sfv.ViewportMatrixElementm00, + sfv.ViewportMatrixElementm11, + sfv.ViewportMatrixElementm30, + sfv.ViewportMatrixElementm31, + &sfv.XMinClipGuardband, + &sfv.XMaxClipGuardband, + &sfv.YMinClipGuardband, + &sfv.YMaxClipGuardband); + } + + GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv); + } + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) { + clip.SFClipViewportPointer = sf_clip_state.offset; + } +} + +static void +cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer, + bool depth_clamp_enable) +{ + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint32_t count = dyn->vp.viewport_count; + const VkViewport *viewports = dyn->vp.viewports; + struct anv_state cc_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32); + + for (uint32_t i = 0; i < count; i++) { + const VkViewport *vp = &viewports[i]; + + /* From the Vulkan spec: + * + * "It is valid for minDepth to be greater than or equal to + * maxDepth." + */ + float min_depth = MIN2(vp->minDepth, vp->maxDepth); + float max_depth = MAX2(vp->minDepth, vp->maxDepth); + + struct GENX(CC_VIEWPORT) cc_viewport = { + .MinimumDepth = depth_clamp_enable ? min_depth : 0.0f, + .MaximumDepth = depth_clamp_enable ? max_depth : 1.0f, + }; + + GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport); + } + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) { + cc.CCViewportPointer = cc_state.offset; + } +} + +static int64_t +clamp_int64(int64_t x, int64_t min, int64_t max) +{ + if (x < min) + return min; + else if (x < max) + return x; + else + return max; +} + +static void +cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint32_t count = dyn->vp.scissor_count; + const VkRect2D *scissors = dyn->vp.scissors; + const VkViewport *viewports = dyn->vp.viewports; + + /* Wa_1409725701: + * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is + * stored as an array of up to 16 elements. The location of first + * element of the array, as specified by Pointer to SCISSOR_RECT, should + * be aligned to a 64-byte boundary. + */ + uint32_t alignment = 64; + struct anv_state scissor_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, alignment); + + for (uint32_t i = 0; i < count; i++) { + const VkRect2D *s = &scissors[i]; + const VkViewport *vp = &viewports[i]; + + /* Since xmax and ymax are inclusive, we have to have xmax < xmin or + * ymax < ymin for empty clips.
In case clip x, y, width height are all + * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't + * what we want. Just special case empty clips and produce a canonical + * empty clip. */ + static const struct GENX(SCISSOR_RECT) empty_scissor = { + .ScissorRectangleYMin = 1, + .ScissorRectangleXMin = 1, + .ScissorRectangleYMax = 0, + .ScissorRectangleXMax = 0 + }; + + const int max = 0xffff; + + uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height)); + uint32_t x_min = MAX2(s->offset.x, vp->x); + uint32_t y_max = MIN2(s->offset.y + s->extent.height - 1, + MAX2(vp->y, vp->y + vp->height) - 1); + uint32_t x_max = MIN2(s->offset.x + s->extent.width - 1, + vp->x + vp->width - 1); + + /* Do this math using int64_t so overflow gets clamped correctly. */ + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { + y_min = clamp_int64((uint64_t) y_min, gfx->render_area.offset.y, max); + x_min = clamp_int64((uint64_t) x_min, gfx->render_area.offset.x, max); + y_max = clamp_int64((uint64_t) y_max, 0, + gfx->render_area.offset.y + + gfx->render_area.extent.height - 1); + x_max = clamp_int64((uint64_t) x_max, 0, + gfx->render_area.offset.x + + gfx->render_area.extent.width - 1); + } + + struct GENX(SCISSOR_RECT) scissor = { + .ScissorRectangleYMin = y_min, + .ScissorRectangleXMin = x_min, + .ScissorRectangleYMax = y_max, + .ScissorRectangleXMax = x_max + }; + + if (s->extent.width <= 0 || s->extent.height <= 0) { + GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, + &empty_scissor); + } else { + GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor); + } + } + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) { + ssp.ScissorRectPointer = scissor_state.offset; + } +} + +static void +cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer) +{ + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + +#if GFX_VER == 7 +# define streamout_state_dw pipeline->gfx7.streamout_state +#else +# define streamout_state_dw pipeline->gfx8.streamout_state +#endif + + uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)]; + + struct GENX(3DSTATE_STREAMOUT) so = { + GENX(3DSTATE_STREAMOUT_header), + .RenderingDisable = dyn->rs.rasterizer_discard_enable, + }; + GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so); + anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw); +} + +void +genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint32_t *p; + + assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0); + + genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config); + + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1); + + genX(flush_pipeline_select_3d)(cmd_buffer); + + /* Apply any pending pipeline flushes we may have. We want to apply them + * now because, if any of those flushes are for things like push constants, + * the GPU will read the state at weird times. 
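Ignoring the render-area clamp and the int64 overflow handling, the scissor rectangle is the intersection of the API scissor and the viewport, with inclusive max coordinates and a canonical empty rectangle when the scissor has zero extent. A simplified, integer-only sketch of that intersection:

#include <stdint.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
   /* invented scissor and viewport, both in whole pixels */
   int32_t s_x = 100, s_y = 50, s_w = 300, s_h = 200;
   int32_t vp_x = 0, vp_y = 0, vp_w = 256, vp_h = 256;

   if (s_w <= 0 || s_h <= 0) {
      /* canonical empty rectangle: max < min, since the bounds are inclusive */
      printf("xmin=1 xmax=0 ymin=1 ymax=0\n");
      return 0;
   }

   int32_t x_min = MAX2(s_x, vp_x);
   int32_t y_min = MAX2(s_y, MIN2(vp_y, vp_y + vp_h));
   int32_t x_max = MIN2(s_x + s_w - 1, vp_x + vp_w - 1);
   int32_t y_max = MIN2(s_y + s_h - 1, MAX2(vp_y, vp_y + vp_h) - 1);

   printf("xmin=%d xmax=%d ymin=%d ymax=%d\n", x_min, x_max, y_min, y_max);
   return 0;
}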
+ */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used; + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) + vb_emit |= pipeline->vb_used; + + if (vb_emit) { + const uint32_t num_buffers = __builtin_popcount(vb_emit); + const uint32_t num_dwords = 1 + num_buffers * 4; + + p = anv_batch_emitn(&cmd_buffer->batch, num_dwords, + GENX(3DSTATE_VERTEX_BUFFERS)); + uint32_t i = 0; + u_foreach_bit(vb, vb_emit) { + struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer; + uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset; + + struct GENX(VERTEX_BUFFER_STATE) state; + if (buffer) { + uint32_t stride = dyn->vi_binding_strides[vb]; + UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size; + +#if GFX_VER <= 7 + bool per_instance = pipeline->vb[vb].instanced; + uint32_t divisor = pipeline->vb[vb].instance_divisor * + pipeline->instance_multiplier; +#endif + + state = (struct GENX(VERTEX_BUFFER_STATE)) { + .VertexBufferIndex = vb, + + .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo, + ISL_SURF_USAGE_VERTEX_BUFFER_BIT), +#if GFX_VER <= 7 + .BufferAccessType = per_instance ? INSTANCEDATA : VERTEXDATA, + .InstanceDataStepRate = per_instance ? divisor : 1, +#endif + .AddressModifyEnable = true, + .BufferPitch = stride, + .BufferStartingAddress = anv_address_add(buffer->address, offset), + .NullVertexBuffer = offset >= buffer->vk.size, +#if GFX_VER >= 12 + .L3BypassDisable = true, +#endif + +#if GFX_VER >= 8 + .BufferSize = size, +#else + /* XXX: to handle dynamic offset for older gens we might want + * to modify Endaddress, but there are issues when doing so: + * + * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439 + */ + .EndAddress = anv_address_add(buffer->address, buffer->vk.size - 1), +#endif + }; + } else { + state = (struct GENX(VERTEX_BUFFER_STATE)) { + .VertexBufferIndex = vb, + .NullVertexBuffer = true, + .MOCS = anv_mocs(cmd_buffer->device, NULL, + ISL_SURF_USAGE_VERTEX_BUFFER_BIT), + }; + } + +#if GFX_VER >= 8 && GFX_VER <= 9 + genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb, + state.BufferStartingAddress, + state.BufferSize); +#endif + + GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state); + i++; + } + } + + cmd_buffer->state.gfx.vb_dirty &= ~vb_emit; + + uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty & + pipeline->active_stages; + if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty && + !vk_dynamic_graphics_state_any_dirty(dyn) && + !cmd_buffer->state.push_constants_dirty) + return; + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) || + (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty & + ANV_CMD_DIRTY_PIPELINE))) { + /* Wa_16011411144: + * + * SW must insert a PIPE_CONTROL cmd before and after the + * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_* + * state is not combined with other state changes. + */ + if (intel_device_info_is_dg2(cmd_buffer->device->info)) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "before SO_BUFFER change WA"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + + /* We don't need any per-buffer dirty tracking because you're not + * allowed to bind different XFB buffers while XFB is enabled. 
+ */ + for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) { + struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx]; + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) { +#if GFX_VER < 12 + sob.SOBufferIndex = idx; +#else + sob._3DCommandOpcode = 0; + sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx; +#endif + + if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) { + sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo, 0); + sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address, + xfb->offset); +#if GFX_VER >= 8 + sob.SOBufferEnable = true; + sob.StreamOffsetWriteEnable = false; + /* Size is in DWords - 1 */ + sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1; +#else + /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so + * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the + * default for an empty SO_BUFFER packet) to disable them. + */ + sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx]; + sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address, + xfb->offset + xfb->size); +#endif + } else { + sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0); + } + } + } + + if (intel_device_info_is_dg2(cmd_buffer->device->info)) { + /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "after SO_BUFFER change WA"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } else if (GFX_VER >= 10) { + /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "after 3DSTATE_SO_BUFFER call"); + } + } + + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { + anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch); + + /* If the pipeline changed, we may need to re-allocate push constant + * space in the URB. + */ + cmd_buffer_alloc_push_constants(cmd_buffer); + } + +#if GFX_VER <= 7 + if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT || + cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) { + /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: + * + * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth + * stall needs to be sent just prior to any 3DSTATE_VS, + * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS, + * 3DSTATE_BINDING_TABLE_POINTER_VS, + * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one + * PIPE_CONTROL needs to be sent before any combination of VS + * associated 3DSTATE." + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DepthStallEnable = true; + pc.PostSyncOperation = WriteImmediateData; + pc.Address = cmd_buffer->device->workaround_address; + anv_debug_dump_pc(pc); + } + } +#endif + + /* Render targets live in the same binding table as fragment descriptors */ + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) + descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT; + + /* We emit the binding tables and sampler tables first, then emit push + * constants and then finally emit binding table and sampler table + * pointers. It has to happen in this order, since emitting the binding + * tables may change the push constants (in case of storage images). After + * emitting push constants, on SKL+ we have to emit the corresponding + * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect. 
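The Gfx8+ and Gfx7 halves of the SO_BUFFER setup express the same binding in two ways: a surface size in DWords minus one versus an explicit end address. A small sketch of that arithmetic with made-up offset/size values:

#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
   /* invented transform feedback binding */
   uint64_t xfb_base   = 0x100000;   /* fake GPU address of the buffer */
   uint64_t xfb_offset = 256;
   uint64_t xfb_size   = 4000;       /* bytes */

   /* Gfx8+: 3DSTATE_SO_BUFFER takes a surface size in DWords, minus one */
   uint32_t surface_size = (uint32_t)DIV_ROUND_UP(xfb_size, 4) - 1;

   /* Gfx7: there is no size field, so an end address is programmed instead */
   uint64_t start = xfb_base + xfb_offset;
   uint64_t end   = start + xfb_size;

   printf("gfx8 SurfaceSize=%u, gfx7 [0x%llx, 0x%llx)\n",
          surface_size, (unsigned long long)start, (unsigned long long)end);
   return 0;
}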
+ */ + uint32_t dirty = 0; + if (descriptors_dirty) { + dirty = flush_descriptor_sets(cmd_buffer, + &cmd_buffer->state.gfx.base, + descriptors_dirty, + pipeline->shaders, + ARRAY_SIZE(pipeline->shaders)); + cmd_buffer->state.descriptors_dirty &= ~dirty; + } + + if (dirty || cmd_buffer->state.push_constants_dirty) { + /* Because we're pushing UBOs, we have to push whenever either + * descriptors or push constants is dirty. + */ + dirty |= cmd_buffer->state.push_constants_dirty; + cmd_buffer_flush_push_constants(cmd_buffer, + dirty & VK_SHADER_STAGE_ALL_GRAPHICS); +#if GFX_VERx10 >= 125 + cmd_buffer_flush_mesh_inline_data( + cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_NV | + VK_SHADER_STAGE_MESH_BIT_NV)); +#endif + } + + if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) { + cmd_buffer_emit_descriptor_pointers(cmd_buffer, + dirty & VK_SHADER_STAGE_ALL_GRAPHICS); + } + + cmd_buffer_emit_clip(cmd_buffer); + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_XFB_ENABLE)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE)) + cmd_buffer_emit_streamout(cmd_buffer); + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_RENDER_TARGETS)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) { + cmd_buffer_emit_viewport(cmd_buffer); + cmd_buffer_emit_depth_viewport(cmd_buffer, + pipeline->depth_clamp_enable); + cmd_buffer_emit_scissor(cmd_buffer); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) { + uint32_t topology; + if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) + topology = _3DPRIM_PATCHLIST(pipeline->patch_control_points); + else + topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology]; + + cmd_buffer->state.gfx.primitive_topology = topology; + +#if (GFX_VER >= 8) + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) { + vft.PrimitiveTopologyType = topology; + } +#endif + } + + genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); +} + +static void +emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr, + uint32_t size, uint32_t index) +{ + uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5, + GENX(3DSTATE_VERTEX_BUFFERS)); + + GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1, + &(struct GENX(VERTEX_BUFFER_STATE)) { + .VertexBufferIndex = index, + .AddressModifyEnable = true, + .BufferPitch = 0, + .MOCS = anv_mocs(cmd_buffer->device, addr.bo, + ISL_SURF_USAGE_VERTEX_BUFFER_BIT), + .NullVertexBuffer = size == 0, +#if GFX_VER >= 12 + .L3BypassDisable = true, +#endif +#if (GFX_VER >= 8) + .BufferStartingAddress = addr, + .BufferSize = size +#else + .BufferStartingAddress = addr, + .EndAddress = anv_address_add(addr, size), +#endif + }); + + genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, + index, addr, size); +} + +static void +emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr) +{ + emit_vertex_bo(cmd_buffer, addr, addr.bo ? 
8 : 0, ANV_SVGS_VB_INDEX); +} + +static void +emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer, + uint32_t base_vertex, uint32_t base_instance) +{ + if (base_vertex == 0 && base_instance == 0) { + emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS); + } else { + struct anv_state id_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4); + + ((uint32_t *)id_state.map)[0] = base_vertex; + ((uint32_t *)id_state.map)[1] = base_instance; + + struct anv_address addr = { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = id_state.offset, + }; + + emit_base_vertex_instance_bo(cmd_buffer, addr); + } +} + +static void +emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index) +{ + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4); + + ((uint32_t *)state.map)[0] = draw_index; + + struct anv_address addr = { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = state.offset, + }; + + emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX); +} + +static void +update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type) +{ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + uint64_t vb_used = pipeline->vb_used; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + vb_used |= 1ull << ANV_SVGS_VB_INDEX; + if (vs_prog_data->uses_drawid) + vb_used |= 1ull << ANV_DRAWID_VB_INDEX; + + genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, + access_type == RANDOM, + vb_used); +} + +ALWAYS_INLINE static void +cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer, + const struct brw_vs_prog_data *vs_prog_data, + uint32_t base_vertex, + uint32_t base_instance, + uint32_t draw_id, + bool force_flush) +{ + bool emitted = false; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) { + emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance); + emitted = true; + } + if (vs_prog_data->uses_drawid) { + emit_draw_index(cmd_buffer, draw_id); + emitted = true; + } + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
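+    *
+    * (For reference: force_flush additionally makes sure any pipe flushes
+    * queued earlier in the draw path are applied before the first
+    * 3DPRIMITIVE, even when no vertex data was emitted here.)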
+ */ + if (emitted || force_flush) + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); +} + +void genX(CmdDraw)( + VkCommandBuffer commandBuffer, + uint32_t vertexCount, + uint32_t instanceCount, + uint32_t firstVertex, + uint32_t firstInstance) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + const uint32_t count = + vertexCount * instanceCount * pipeline->instance_multiplier; + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw", count); + trace_intel_begin_draw(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, + firstVertex, firstInstance, 0, + true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + prim.VertexCountPerInstance = vertexCount; + prim.StartVertexLocation = firstVertex; + prim.InstanceCount = instanceCount * + pipeline->instance_multiplier; + prim.StartInstanceLocation = firstInstance; + prim.BaseVertexLocation = 0; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); + + trace_intel_end_draw(&cmd_buffer->trace, count); +} + +void genX(CmdDrawMultiEXT)( + VkCommandBuffer commandBuffer, + uint32_t drawCount, + const VkMultiDrawInfoEXT *pVertexInfo, + uint32_t instanceCount, + uint32_t firstInstance, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + const uint32_t count = + drawCount * instanceCount * pipeline->instance_multiplier; + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw_multi", count); + trace_intel_begin_draw_multi(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + uint32_t i = 0; + vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) { + cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, + draw->firstVertex, + firstInstance, i, !i); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + prim.VertexCountPerInstance = draw->vertexCount; + prim.StartVertexLocation = draw->firstVertex; + prim.InstanceCount = instanceCount * + pipeline->instance_multiplier; + prim.StartInstanceLocation = firstInstance; + prim.BaseVertexLocation = 0; + } + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); + + trace_intel_end_draw_multi(&cmd_buffer->trace, count); +} + +void genX(CmdDrawIndexed)( + VkCommandBuffer commandBuffer, + uint32_t indexCount, + uint32_t instanceCount, + uint32_t firstIndex, + int32_t vertexOffset, + uint32_t firstInstance) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + 
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + const uint32_t count = + indexCount * instanceCount * pipeline->instance_multiplier; + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indexed", + count); + trace_intel_begin_draw_indexed(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, vertexOffset, firstInstance, 0, true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + prim.VertexCountPerInstance = indexCount; + prim.StartVertexLocation = firstIndex; + prim.InstanceCount = instanceCount * + pipeline->instance_multiplier; + prim.StartInstanceLocation = firstInstance; + prim.BaseVertexLocation = vertexOffset; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); + + trace_intel_end_draw_indexed(&cmd_buffer->trace, count); +} + +void genX(CmdDrawMultiIndexedEXT)( + VkCommandBuffer commandBuffer, + uint32_t drawCount, + const VkMultiDrawIndexedInfoEXT *pIndexInfo, + uint32_t instanceCount, + uint32_t firstInstance, + uint32_t stride, + const int32_t *pVertexOffset) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + const uint32_t count = + drawCount * instanceCount * pipeline->instance_multiplier; + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indexed_multi", + count); + trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + uint32_t i = 0; + if (pVertexOffset) { + if (vs_prog_data->uses_drawid) { + bool emitted = true; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) { + emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance); + emitted = true; + } + vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { + if (vs_prog_data->uses_drawid) { + emit_draw_index(cmd_buffer, i); + emitted = true; + } + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
+ */ + if (emitted) + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + prim.VertexCountPerInstance = draw->indexCount; + prim.StartVertexLocation = draw->firstIndex; + prim.InstanceCount = instanceCount * + pipeline->instance_multiplier; + prim.StartInstanceLocation = firstInstance; + prim.BaseVertexLocation = *pVertexOffset; + } + emitted = false; + } + } else { + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) { + emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + prim.VertexCountPerInstance = draw->indexCount; + prim.StartVertexLocation = draw->firstIndex; + prim.InstanceCount = instanceCount * + pipeline->instance_multiplier; + prim.StartInstanceLocation = firstInstance; + prim.BaseVertexLocation = *pVertexOffset; + } + } + } + } else { + vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { + cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, + draw->vertexOffset, + firstInstance, i, i != 0); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + prim.VertexCountPerInstance = draw->indexCount; + prim.StartVertexLocation = draw->firstIndex; + prim.InstanceCount = instanceCount * + pipeline->instance_multiplier; + prim.StartInstanceLocation = firstInstance; + prim.BaseVertexLocation = draw->vertexOffset; + } + } + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); + + trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count); +} + +/* Auto-Draw / Indirect Registers */ +#define GFX7_3DPRIM_END_OFFSET 0x2420 +#define GFX7_3DPRIM_START_VERTEX 0x2430 +#define GFX7_3DPRIM_VERTEX_COUNT 0x2434 +#define GFX7_3DPRIM_INSTANCE_COUNT 0x2438 +#define GFX7_3DPRIM_START_INSTANCE 0x243C +#define GFX7_3DPRIM_BASE_VERTEX 0x2440 + +void genX(CmdDrawIndirectByteCountEXT)( + VkCommandBuffer commandBuffer, + uint32_t instanceCount, + uint32_t firstInstance, + VkBuffer counterBuffer, + VkDeviceSize counterBufferOffset, + uint32_t counterOffset, + uint32_t vertexStride) +{ +#if GFX_VERx10 >= 75 + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + /* firstVertex is always zero for this draw function */ + const uint32_t firstVertex = 0; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indirect byte count", + instanceCount * pipeline->instance_multiplier); + trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace); + + 
genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, 0); + + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + struct mi_value count = + mi_mem32(anv_address_add(counter_buffer->address, + counterBufferOffset)); + if (counterOffset) + count = mi_isub(&b, count, mi_imm(counterOffset)); + count = mi_udiv32_imm(&b, count, vertexStride); + mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count); + + mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex)); + mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), + mi_imm(instanceCount * pipeline->instance_multiplier)); + mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance)); + mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); + + trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace, + instanceCount * pipeline->instance_multiplier); +#endif /* GFX_VERx10 >= 75 */ +} + +static void +load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr, + bool indexed) +{ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), + mi_mem32(anv_address_add(addr, 0))); + + struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4)); + if (pipeline->instance_multiplier > 1) { +#if GFX_VERx10 >= 75 + instance_count = mi_imul_imm(&b, instance_count, + pipeline->instance_multiplier); +#else + anv_finishme("Multiview + indirect draw requires MI_MATH; " + "MI_MATH is not supported on Ivy Bridge"); +#endif + } + mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count); + + mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), + mi_mem32(anv_address_add(addr, 8))); + + if (indexed) { + mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), + mi_mem32(anv_address_add(addr, 12))); + mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), + mi_mem32(anv_address_add(addr, 16))); + } else { + mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), + mi_mem32(anv_address_add(addr, 12))); + mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0)); + } +} + +void genX(CmdDrawIndirect)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indirect", 
+ drawCount); + trace_intel_begin_draw_indirect(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + for (uint32_t i = 0; i < drawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8)); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); + + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + load_indirect_parameters(cmd_buffer, draw, false); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); + + offset += stride; + } + + trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount); +} + +void genX(CmdDrawIndexedIndirect)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indexed indirect", + drawCount); + trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + for (uint32_t i = 0; i < drawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + + /* TODO: We need to stomp base vertex to 0 somehow */ + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12)); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); + + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
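+       *
+       * (For reference: anv_address_add(draw, 12) above points at the
+       * vertexOffset/firstInstance pair of VkDrawIndexedIndirectCommand,
+       * so the 8 bytes bound through ANV_SVGS_VB_INDEX are the base
+       * vertex/base instance values the vertex shader expects.)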
+ */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + load_indirect_parameters(cmd_buffer, draw, true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); + + offset += stride; + } + + trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount); +} + +static struct mi_value +prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer, + struct mi_builder *b, + struct anv_buffer *count_buffer, + uint64_t countBufferOffset) +{ + struct anv_address count_address = + anv_address_add(count_buffer->address, countBufferOffset); + + struct mi_value ret = mi_imm(0); + + if (cmd_buffer->state.conditional_render_enabled) { +#if GFX_VERx10 >= 75 + ret = mi_new_gpr(b); + mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address)); +#endif + } else { + /* Upload the current draw count from the draw parameters buffer to + * MI_PREDICATE_SRC0. + */ + mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address)); + mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0)); + } + + return ret; +} + +static void +emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer, + struct mi_builder *b, + uint32_t draw_index) +{ + /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */ + mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index)); + + if (draw_index == 0) { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + } else { + /* While draw_index < draw_count the predicate's result will be + * (draw_index == draw_count) ^ TRUE = TRUE + * When draw_index == draw_count the result is + * (TRUE) ^ TRUE = FALSE + * After this all results will be: + * (FALSE) ^ FALSE = FALSE + */ + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_XOR; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + } +} + +#if GFX_VERx10 >= 75 +static void +emit_draw_count_predicate_with_conditional_render( + struct anv_cmd_buffer *cmd_buffer, + struct mi_builder *b, + uint32_t draw_index, + struct mi_value max) +{ + struct mi_value pred = mi_ult(b, mi_imm(draw_index), max); + pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG)); + +#if GFX_VER >= 8 + mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred); +#else + /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser + * so we emit MI_PREDICATE to set it. 
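+    *
+    * (Sketch of the intent: with SRC0 = pred and SRC1 = 0, LOAD_LOADINV +
+    * COMPARE_SRCS_EQUAL loads !(pred == 0) into the predicate, so the
+    * predicated draws that follow only execute while pred is non-zero.)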
+ */ + + mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred); + mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } +#endif +} +#endif + +static void +emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer, + struct mi_builder *b, + uint32_t draw_index, + struct mi_value max) +{ +#if GFX_VERx10 >= 75 + if (cmd_buffer->state.conditional_render_enabled) { + emit_draw_count_predicate_with_conditional_render( + cmd_buffer, b, draw_index, mi_value_ref(b, max)); + } else { + emit_draw_count_predicate(cmd_buffer, b, draw_index); + } +#else + emit_draw_count_predicate(cmd_buffer, b, draw_index); +#endif +} + +void genX(CmdDrawIndirectCount)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + VkBuffer _countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indirect count", + 0); + trace_intel_begin_draw_indirect_count(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + struct mi_value max = + prepare_for_draw_count_predicate(cmd_buffer, &b, + count_buffer, countBufferOffset); + + for (uint32_t i = 0; i < maxDrawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + + emit_draw_count_predicate_cond(cmd_buffer, &b, i, max); + + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8)); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); + + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
+ */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + load_indirect_parameters(cmd_buffer, draw, false); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = true; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); + + offset += stride; + } + + mi_value_unref(&b, max); + + trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount); +} + +void genX(CmdDrawIndexedIndirectCount)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + VkBuffer _countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indexed indirect count", + 0); + trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + struct mi_value max = + prepare_for_draw_count_predicate(cmd_buffer, &b, + count_buffer, countBufferOffset); + + for (uint32_t i = 0; i < maxDrawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + + emit_draw_count_predicate_cond(cmd_buffer, &b, i, max); + + /* TODO: We need to stomp base vertex to 0 somehow */ + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12)); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); + + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + load_indirect_parameters(cmd_buffer, draw, true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = true; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); + + offset += stride; + } + + mi_value_unref(&b, max); + + trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount); + +} + +void genX(CmdBeginTransformFeedbackEXT)( + VkCommandBuffer commandBuffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer* pCounterBuffers, + const VkDeviceSize* pCounterBufferOffsets) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + assert(firstCounterBuffer < MAX_XFB_BUFFERS); + assert(counterBufferCount <= MAX_XFB_BUFFERS); + assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS); + + /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET: + * + * "Ssoftware must ensure that no HW stream output operations can be in + * process or otherwise pending at the point that the MI_LOAD/STORE + * commands are processed. This will likely require a pipeline flush." 
+    */
+   anv_add_pending_pipe_bits(cmd_buffer,
+                             ANV_PIPE_CS_STALL_BIT,
+                             "begin transform feedback");
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
+      /* If we have a counter buffer, this is a resume so we need to load the
+       * value into the streamout offset register. Otherwise, this is a begin
+       * and we need to reset it to zero.
+       */
+      if (pCounterBuffers &&
+          idx >= firstCounterBuffer &&
+          idx - firstCounterBuffer < counterBufferCount &&
+          pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
+         uint32_t cb_idx = idx - firstCounterBuffer;
+         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
+         uint64_t offset = pCounterBufferOffsets ?
+                           pCounterBufferOffsets[cb_idx] : 0;
+
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+            lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+            lrm.MemoryAddress = anv_address_add(counter_buffer->address,
+                                                offset);
+         }
+      } else {
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+            lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+            lri.DataDWord = 0;
+         }
+      }
+   }
+
+   cmd_buffer->state.xfb_enabled = true;
+   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+}
+
+void genX(CmdEndTransformFeedbackEXT)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstCounterBuffer,
+    uint32_t                                    counterBufferCount,
+    const VkBuffer*                             pCounterBuffers,
+    const VkDeviceSize*                         pCounterBufferOffsets)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   assert(firstCounterBuffer < MAX_XFB_BUFFERS);
+   assert(counterBufferCount <= MAX_XFB_BUFFERS);
+   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
+
+   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
+    *
+    *    "Software must ensure that no HW stream output operations can be in
+    *    process or otherwise pending at the point that the MI_LOAD/STORE
+    *    commands are processed. This will likely require a pipeline flush."
+    */
+   anv_add_pending_pipe_bits(cmd_buffer,
+                             ANV_PIPE_CS_STALL_BIT,
+                             "end transform feedback");
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
+      unsigned idx = firstCounterBuffer + cb_idx;
+
+      /* If we have a counter buffer, store the current streamout write
+       * offset into it so a later resume can reload it. Without a counter
+       * buffer there is nothing to save.
+       */
+      if (pCounterBuffers &&
+          cb_idx < counterBufferCount &&
+          pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
+         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
+         uint64_t offset = pCounterBufferOffsets ?
+                           pCounterBufferOffsets[cb_idx] : 0;
+
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
+            srm.MemoryAddress = anv_address_add(counter_buffer->address,
+                                                offset);
+            srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+         }
+      }
+   }
+
+   cmd_buffer->state.xfb_enabled = false;
+   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+}
+
+#if GFX_VERx10 >= 125
+void
+genX(CmdDrawMeshTasksNV)(
+    VkCommandBuffer commandBuffer,
+    uint32_t taskCount,
+    uint32_t firstTask)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   /* TODO(mesh): Check if this is not emitting more packets than we need.
*/ + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + /* BSpec 54016 says: "The values passed for Starting ThreadGroup ID X + * and ThreadGroup Count X shall not cause TGIDs to exceed (2^32)-1." + */ + assert((int64_t)firstTask + taskCount - 1 <= UINT32_MAX); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_1D), m) { + m.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + m.ThreadGroupCountX = taskCount; + m.StartingThreadGroupIDX = firstTask; + } +} + +#define GFX125_3DMESH_TG_COUNT 0x26F0 +#define GFX125_3DMESH_STARTING_TGID 0x26F4 +#define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */ + +static void +mesh_load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer, + struct mi_builder *b, + struct anv_address addr, + bool emit_xp0, + uint32_t xp0) +{ + const size_t taskCountOff = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount); + const size_t firstTaskOff = offsetof(VkDrawMeshTasksIndirectCommandNV, firstTask); + + mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT), + mi_mem32(anv_address_add(addr, taskCountOff))); + + mi_store(b, mi_reg32(GFX125_3DMESH_STARTING_TGID), + mi_mem32(anv_address_add(addr, firstTaskOff))); + + if (emit_xp0) + mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0)); +} + +static void +emit_indirect_3dmesh_1d(struct anv_batch *batch, + bool predicate_enable, + bool uses_drawid) +{ + uint32_t len = GENX(3DMESH_1D_length) + uses_drawid; + uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_1D), + .PredicateEnable = predicate_enable, + .IndirectParameterEnable = true, + .ExtendedParameter0Present = uses_drawid); + if (uses_drawid) + dw[len - 1] = 0; +} + +void +genX(CmdDrawMeshTasksIndirectNV)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline); + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_state->conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) || + mesh_prog_data->uses_drawid; + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t i = 0; i < drawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + + mesh_load_indirect_parameters(cmd_buffer, &b, draw, uses_drawid, i); + + emit_indirect_3dmesh_1d(&cmd_buffer->batch, + cmd_state->conditional_render_enabled, uses_drawid); + + offset += stride; + } +} + +void +genX(CmdDrawMeshTasksIndirectCountNV)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + VkBuffer _countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_task_prog_data 
*task_prog_data = get_task_prog_data(pipeline); + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + genX(cmd_buffer_flush_state)(cmd_buffer); + + bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) || + mesh_prog_data->uses_drawid; + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + struct mi_value max = + prepare_for_draw_count_predicate(cmd_buffer, &b, + count_buffer, countBufferOffset); + + for (uint32_t i = 0; i < maxDrawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + + emit_draw_count_predicate_cond(cmd_buffer, &b, i, max); + + mesh_load_indirect_parameters(cmd_buffer, &b, draw, uses_drawid, i); + + emit_indirect_3dmesh_1d(&cmd_buffer->batch, true, uses_drawid); + + offset += stride; + } +} +#endif /* GFX_VERx10 >= 125 */ + +void +genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute; + struct anv_compute_pipeline *pipeline = comp_state->pipeline; + + assert(pipeline->cs); + + genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config); + + genX(flush_pipeline_select_gpgpu)(cmd_buffer); + + /* Apply any pending pipeline flushes we may have. We want to apply them + * now because, if any of those flushes are for things like push constants, + * the GPU will read the state at weird times. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + if (cmd_buffer->state.compute.pipeline_dirty) { + /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE: + * + * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless + * the only bits that are changed are scoreboard related: Scoreboard + * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For + * these scoreboard related states, a MEDIA_STATE_FLUSH is + * sufficient." + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "flush compute state"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch); + + /* The workgroup size of the pipeline affects our push constant layout + * so flag push constants as dirty if we change the pipeline. 
+ */ + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + } + + if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) || + cmd_buffer->state.compute.pipeline_dirty) { + flush_descriptor_sets(cmd_buffer, + &cmd_buffer->state.compute.base, + VK_SHADER_STAGE_COMPUTE_BIT, + &pipeline->cs, 1); + cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT; + +#if GFX_VERx10 < 125 + uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; + struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { + .BindingTablePointer = + cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset, + .SamplerStatePointer = + cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset, + }; + GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc); + + struct anv_state state = + anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw, + pipeline->interface_descriptor_data, + GENX(INTERFACE_DESCRIPTOR_DATA_length), + 64); + + uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); + anv_batch_emit(&cmd_buffer->batch, + GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) { + mid.InterfaceDescriptorTotalLength = size; + mid.InterfaceDescriptorDataStartAddress = state.offset; + } +#endif + } + + if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) { + comp_state->push_data = + anv_cmd_buffer_cs_push_constants(cmd_buffer); + +#if GFX_VERx10 < 125 + if (comp_state->push_data.alloc_size) { + anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { + curbe.CURBETotalDataLength = comp_state->push_data.alloc_size; + curbe.CURBEDataStartAddress = comp_state->push_data.offset; + } + } +#endif + + cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT; + } + + cmd_buffer->state.compute.pipeline_dirty = false; + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); +} + +#if GFX_VER == 7 + +static VkResult +verify_cmd_parser(const struct anv_device *device, + int required_version, + const char *function) +{ + if (device->physical->cmd_parser_version < required_version) { + return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT, + "cmd parser version %d is required for %s", + required_version, function); + } else { + return VK_SUCCESS; + } +} + +#endif + +static void +anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer, + uint32_t baseGroupX, + uint32_t baseGroupY, + uint32_t baseGroupZ) +{ + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + struct anv_push_constants *push = + &cmd_buffer->state.compute.base.push_constants; + if (push->cs.base_work_group_id[0] != baseGroupX || + push->cs.base_work_group_id[1] != baseGroupY || + push->cs.base_work_group_id[2] != baseGroupZ) { + push->cs.base_work_group_id[0] = baseGroupX; + push->cs.base_work_group_id[1] = baseGroupY; + push->cs.base_work_group_id[2] = baseGroupZ; + + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + } +} + +void genX(CmdDispatch)( + VkCommandBuffer commandBuffer, + uint32_t x, + uint32_t y, + uint32_t z) +{ + genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z); +} + +#if GFX_VERx10 >= 125 + +static inline void +emit_compute_walker(struct anv_cmd_buffer *cmd_buffer, + const struct anv_compute_pipeline *pipeline, bool indirect, + const struct brw_cs_prog_data *prog_data, + uint32_t groupCountX, uint32_t groupCountY, + uint32_t groupCountZ) +{ + struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute; + const struct anv_shader_bin *cs_bin = pipeline->cs; + bool predicate = 
cmd_buffer->state.conditional_render_enabled; + + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct brw_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(devinfo, prog_data, NULL); + + anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) { + cw.IndirectParameterEnable = indirect; + cw.PredicateEnable = predicate; + cw.SIMDSize = dispatch.simd_size / 16; + cw.IndirectDataStartAddress = comp_state->push_data.offset; + cw.IndirectDataLength = comp_state->push_data.alloc_size; + cw.LocalXMaximum = prog_data->local_size[0] - 1; + cw.LocalYMaximum = prog_data->local_size[1] - 1; + cw.LocalZMaximum = prog_data->local_size[2] - 1; + cw.ThreadGroupIDXDimension = groupCountX; + cw.ThreadGroupIDYDimension = groupCountY; + cw.ThreadGroupIDZDimension = groupCountZ; + cw.ExecutionMask = dispatch.right_mask; + cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0); + + cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { + .KernelStartPointer = cs_bin->kernel.offset, + .SamplerStatePointer = + cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset, + .BindingTablePointer = + cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset, + .BindingTableEntryCount = + 1 + MIN2(pipeline->cs->bind_map.surface_count, 30), + .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, + .SharedLocalMemorySize = encode_slm_size(GFX_VER, + prog_data->base.total_shared), + .NumberOfBarriers = prog_data->uses_barrier, + }; + } +} + +#else /* #if GFX_VERx10 >= 125 */ + +static inline void +emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer, + const struct anv_compute_pipeline *pipeline, bool indirect, + const struct brw_cs_prog_data *prog_data, + uint32_t groupCountX, uint32_t groupCountY, + uint32_t groupCountZ) +{ + bool predicate = (GFX_VER <= 7 && indirect) || + cmd_buffer->state.conditional_render_enabled; + + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct brw_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(devinfo, prog_data, NULL); + + anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) { + ggw.IndirectParameterEnable = indirect; + ggw.PredicateEnable = predicate; + ggw.SIMDSize = dispatch.simd_size / 16; + ggw.ThreadDepthCounterMaximum = 0; + ggw.ThreadHeightCounterMaximum = 0; + ggw.ThreadWidthCounterMaximum = dispatch.threads - 1; + ggw.ThreadGroupIDXDimension = groupCountX; + ggw.ThreadGroupIDYDimension = groupCountY; + ggw.ThreadGroupIDZDimension = groupCountZ; + ggw.RightExecutionMask = dispatch.right_mask; + ggw.BottomExecutionMask = 0xffffffff; + } + + anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf); +} + +#endif /* #if GFX_VERx10 >= 125 */ + +static inline void +emit_cs_walker(struct anv_cmd_buffer *cmd_buffer, + const struct anv_compute_pipeline *pipeline, bool indirect, + const struct brw_cs_prog_data *prog_data, + uint32_t groupCountX, uint32_t groupCountY, + uint32_t groupCountZ) +{ +#if GFX_VERx10 >= 125 + emit_compute_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX, + groupCountY, groupCountZ); +#else + emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX, + groupCountY, groupCountZ); +#endif +} + +void genX(CmdDispatchBase)( + VkCommandBuffer commandBuffer, + uint32_t baseGroupX, + uint32_t baseGroupY, + uint32_t baseGroupZ, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_compute_pipeline *pipeline = 
cmd_buffer->state.compute.pipeline; + const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline); + + anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX, + baseGroupY, baseGroupZ); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_COMPUTE, + "compute", + groupCountX * groupCountY * groupCountZ * + prog_data->local_size[0] * prog_data->local_size[1] * + prog_data->local_size[2]); + + trace_intel_begin_compute(&cmd_buffer->trace); + + if (prog_data->uses_num_work_groups) { + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4); + uint32_t *sizes = state.map; + sizes[0] = groupCountX; + sizes[1] = groupCountY; + sizes[2] = groupCountZ; + cmd_buffer->state.compute.num_workgroups = (struct anv_address) { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = state.offset, + }; + + /* The num_workgroups buffer goes in the binding table */ + cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + } + + genX(cmd_buffer_flush_compute_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX, + groupCountY, groupCountZ); + + trace_intel_end_compute(&cmd_buffer->trace, + groupCountX, groupCountY, groupCountZ); +} + +#define GPGPU_DISPATCHDIMX 0x2500 +#define GPGPU_DISPATCHDIMY 0x2504 +#define GPGPU_DISPATCHDIMZ 0x2508 + +void genX(CmdDispatchIndirect)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline; + const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline); + struct anv_address addr = anv_address_add(buffer->address, offset); + UNUSED struct anv_batch *batch = &cmd_buffer->batch; + + anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0); + +#if GFX_VER == 7 + /* Linux 4.4 added command parser version 5 which allows the GPGPU + * indirect dispatch registers to be written. 
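+    *
+    * (The dispatch size is sourced from the GPGPU_DISPATCHDIM{X,Y,Z}
+    * registers defined above; the mi_store() calls below load them from
+    * the indirect buffer, which is the register write the command parser
+    * has to allow.)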
+ */ + if (verify_cmd_parser(cmd_buffer->device, 5, + "vkCmdDispatchIndirect") != VK_SUCCESS) + return; +#endif + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_COMPUTE, + "compute indirect", + 0); + trace_intel_begin_compute(&cmd_buffer->trace); + + if (prog_data->uses_num_work_groups) { + cmd_buffer->state.compute.num_workgroups = addr; + + /* The num_workgroups buffer goes in the binding table */ + cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + } + + genX(cmd_buffer_flush_compute_state)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + struct mi_value size_x = mi_mem32(anv_address_add(addr, 0)); + struct mi_value size_y = mi_mem32(anv_address_add(addr, 4)); + struct mi_value size_z = mi_mem32(anv_address_add(addr, 8)); + + mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x); + mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y); + mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z); + +#if GFX_VER <= 7 + /* predicate = (compute_dispatch_indirect_x_size == 0); */ + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + + /* predicate |= (compute_dispatch_indirect_y_size == 0); */ + mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y); + anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_OR; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + + /* predicate |= (compute_dispatch_indirect_z_size == 0); */ + mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z); + anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_OR; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + + /* predicate = !predicate; */ + anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_OR; + mip.CompareOperation = COMPARE_FALSE; + } + +#if GFX_VERx10 == 75 + if (cmd_buffer->state.conditional_render_enabled) { + /* predicate &= !(conditional_rendering_predicate == 0); */ + mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), + mi_reg32(ANV_PREDICATE_RESULT_REG)); + anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_AND; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + } +#endif + +#else /* GFX_VER > 7 */ + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); +#endif + + emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0); + + trace_intel_end_compute(&cmd_buffer->trace, 0, 0, 0); +} + +struct anv_state +genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer) +{ +#if GFX_VERx10 >= 125 + struct anv_device *device = cmd_buffer->device; + + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + BRW_RT_DISPATCH_GLOBALS_SIZE, + 64); + struct brw_rt_scratch_layout layout; + uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in + * some cases? + */ + brw_rt_compute_scratch_layout(&layout, device->info, + stack_ids_per_dss, 1 << 10); + + struct GFX_RT_DISPATCH_GLOBALS rtdg = { + .MemBaseAddress = (struct anv_address) { + /* The ray query HW computes offsets from the top of the buffer, so + * let the address at the end of the buffer. 
+ */ + .bo = device->ray_query_bo, + .offset = device->ray_query_bo->size + }, + .AsyncRTStackSize = layout.ray_stack_stride / 64, + .NumDSSRTStacks = layout.stack_ids_per_dss, + .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS, + .Flags = RT_DEPTH_TEST_LESS_EQUAL, + .ResumeShaderTable = (struct anv_address) { + .bo = cmd_buffer->state.ray_query_shadow_bo, + }, + }; + GFX_RT_DISPATCH_GLOBALS_pack(NULL, state.map, &rtdg); + + return state; +#else + unreachable("Not supported"); +#endif +} + +#if GFX_VERx10 >= 125 +static void +calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3]) +{ + unsigned total_shift = 0; + memset(local_shift, 0, 3); + + bool progress; + do { + progress = false; + for (unsigned i = 0; i < 3; i++) { + assert(global[i] > 0); + if ((1 << local_shift[i]) < global[i]) { + progress = true; + local_shift[i]++; + total_shift++; + } + + if (total_shift == 3) + return; + } + } while(progress); + + /* Assign whatever's left to x */ + local_shift[0] += 3 - total_shift; +} + +static struct GFX_RT_SHADER_TABLE +vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region) +{ + return (struct GFX_RT_SHADER_TABLE) { + .BaseAddress = anv_address_from_u64(region->deviceAddress), + .Stride = region->stride, + }; +} + +static void +cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer, + const VkStridedDeviceAddressRegionKHR *raygen_sbt, + const VkStridedDeviceAddressRegionKHR *miss_sbt, + const VkStridedDeviceAddressRegionKHR *hit_sbt, + const VkStridedDeviceAddressRegionKHR *callable_sbt, + bool is_indirect, + uint32_t launch_width, + uint32_t launch_height, + uint32_t launch_depth, + uint64_t launch_size_addr) +{ + struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt; + struct anv_ray_tracing_pipeline *pipeline = rt->pipeline; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + /* If we have a known degenerate launch size, just bail */ + if (!is_indirect && + (launch_width == 0 || launch_height == 0 || launch_depth == 0)) + return; + + genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config); + genX(flush_pipeline_select_gpgpu)(cmd_buffer); + + cmd_buffer->state.rt.pipeline_dirty = false; + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* Add these to the reloc list as they're internal buffers that don't + * actually have relocs to pick them up manually. 
+ * + * TODO(RT): This is a bit of a hack + */ + anv_reloc_list_add_bo(cmd_buffer->batch.relocs, + cmd_buffer->batch.alloc, + rt->scratch.bo); + + /* Allocate and set up our RT_DISPATCH_GLOBALS */ + struct anv_state rtdg_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + BRW_RT_PUSH_CONST_OFFSET + + sizeof(struct anv_push_constants), + 64); + + struct GFX_RT_DISPATCH_GLOBALS rtdg = { + .MemBaseAddress = (struct anv_address) { + .bo = rt->scratch.bo, + .offset = rt->scratch.layout.ray_stack_start, + }, + .CallStackHandler = + anv_shader_bin_get_bsr(cmd_buffer->device->rt_trivial_return, 0), + .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64, + .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss, + .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS, + .Flags = RT_DEPTH_TEST_LESS_EQUAL, + .HitGroupTable = vk_sdar_to_shader_table(hit_sbt), + .MissGroupTable = vk_sdar_to_shader_table(miss_sbt), + .SWStackSize = rt->scratch.layout.sw_stack_size / 64, + .LaunchWidth = launch_width, + .LaunchHeight = launch_height, + .LaunchDepth = launch_depth, + .CallableGroupTable = vk_sdar_to_shader_table(callable_sbt), + }; + GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg); + + /* Push constants go after the RT_DISPATCH_GLOBALS */ + assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET); + memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET, + &cmd_buffer->state.rt.base.push_constants, + sizeof(struct anv_push_constants)); + + struct anv_address rtdg_addr = { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = rtdg_state.offset, + }; + + uint8_t local_size_log2[3]; + uint32_t global_size[3] = {}; + if (is_indirect) { + /* Pick a local size that's probably ok. We assume most TraceRays calls + * will use a two-dimensional dispatch size. Worst case, our initial + * dispatch will be a little slower than it has to be. + */ + local_size_log2[0] = 2; + local_size_log2[1] = 1; + local_size_log2[2] = 0; + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + struct mi_value launch_size[3] = { + mi_mem32(anv_address_from_u64(launch_size_addr + 0)), + mi_mem32(anv_address_from_u64(launch_size_addr + 4)), + mi_mem32(anv_address_from_u64(launch_size_addr + 8)), + }; + + /* Store the original launch size into RT_DISPATCH_GLOBALS + * + * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets + * moved into a genX version. + */ + mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)), + mi_value_ref(&b, launch_size[0])); + mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)), + mi_value_ref(&b, launch_size[1])); + mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)), + mi_value_ref(&b, launch_size[2])); + + /* Compute the global dispatch size */ + for (unsigned i = 0; i < 3; i++) { + if (local_size_log2[i] == 0) + continue; + + /* global_size = DIV_ROUND_UP(launch_size, local_size) + * + * Fortunately for us MI_ALU math is 64-bit and , mi_ushr32_imm + * has the semantics of shifting the enture 64-bit value and taking + * the bottom 32 so we don't have to worry about roll-over. 
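+          *
+          * (Illustrative example: launch_size[i] = 100 with
+          * local_size_log2[i] = 2 becomes (100 + 3) >> 2 = 25 thread
+          * groups, i.e. DIV_ROUND_UP(100, 4).)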
+ */ + uint32_t local_size = 1 << local_size_log2[i]; + launch_size[i] = mi_iadd(&b, launch_size[i], + mi_imm(local_size - 1)); + launch_size[i] = mi_ushr32_imm(&b, launch_size[i], + local_size_log2[i]); + } + + mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]); + mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]); + mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]); + } else { + uint32_t launch_size[3] = { launch_width, launch_height, launch_depth }; + calc_local_trace_size(local_size_log2, launch_size); + + for (unsigned i = 0; i < 3; i++) { + /* We have to be a bit careful here because DIV_ROUND_UP adds to the + * numerator value may overflow. Cast to uint64_t to avoid this. + */ + uint32_t local_size = 1 << local_size_log2[i]; + global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size); + } + } + + anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) { + cw.IndirectParameterEnable = is_indirect; + cw.PredicateEnable = false; + cw.SIMDSize = SIMD8; + cw.LocalXMaximum = (1 << local_size_log2[0]) - 1; + cw.LocalYMaximum = (1 << local_size_log2[1]) - 1; + cw.LocalZMaximum = (1 << local_size_log2[2]) - 1; + cw.ThreadGroupIDXDimension = global_size[0]; + cw.ThreadGroupIDYDimension = global_size[1]; + cw.ThreadGroupIDZDimension = global_size[2]; + cw.ExecutionMask = 0xff; + cw.EmitInlineParameter = true; + cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0); + + const gl_shader_stage s = MESA_SHADER_RAYGEN; + struct anv_device *device = cmd_buffer->device; + struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s]; + struct anv_state *samplers = &cmd_buffer->state.samplers[s]; + cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { + .KernelStartPointer = device->rt_trampoline->kernel.offset, + .SamplerStatePointer = samplers->offset, + /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */ + .SamplerCount = 0, + .BindingTablePointer = surfaces->offset, + .NumberofThreadsinGPGPUThreadGroup = 1, + .BTDMode = true, + }; + + struct brw_rt_raygen_trampoline_params trampoline_params = { + .rt_disp_globals_addr = anv_address_physical(rtdg_addr), + .raygen_bsr_addr = raygen_sbt->deviceAddress, + .is_indirect = is_indirect, + .local_group_size_log2 = { + local_size_log2[0], + local_size_log2[1], + local_size_log2[2], + }, + }; + STATIC_ASSERT(sizeof(trampoline_params) == 32); + memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params)); + } +} + +void +genX(CmdTraceRaysKHR)( + VkCommandBuffer commandBuffer, + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, + uint32_t width, + uint32_t height, + uint32_t depth) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer_trace_rays(cmd_buffer, + pRaygenShaderBindingTable, + pMissShaderBindingTable, + pHitShaderBindingTable, + pCallableShaderBindingTable, + false /* is_indirect */, + width, height, depth, + 0 /* launch_size_addr */); +} + +void +genX(CmdTraceRaysIndirectKHR)( + VkCommandBuffer commandBuffer, + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, + VkDeviceAddress indirectDeviceAddress) +{ + 
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer_trace_rays(cmd_buffer, + pRaygenShaderBindingTable, + pMissShaderBindingTable, + pHitShaderBindingTable, + pCallableShaderBindingTable, + true /* is_indirect */, + 0, 0, 0, /* width, height, depth, */ + indirectDeviceAddress); +} +#endif /* GFX_VERx10 >= 125 */ + +static void +genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, + uint32_t pipeline) +{ + UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info; + + if (cmd_buffer->state.current_pipeline == pipeline) + return; + +#if GFX_VER >= 8 && GFX_VER < 10 + /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT: + * + * Software must clear the COLOR_CALC_STATE Valid field in + * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT + * with Pipeline Select set to GPGPU. + * + * The internal hardware docs recommend the same workaround for Gfx9 + * hardware too. + */ + if (pipeline == GPGPU) + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t); +#endif + +#if GFX_VER == 9 + if (pipeline == _3D) { + /* There is a mid-object preemption workaround which requires you to + * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D. However, + * even without preemption, we have issues with geometry flickering when + * GPGPU and 3D are back-to-back and this seems to fix it. We don't + * really know why. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) { + vfe.MaximumNumberofThreads = + devinfo->max_cs_threads * devinfo->subslice_total - 1; + vfe.NumberofURBEntries = 2; + vfe.URBEntryAllocationSize = 2; + } + + /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is + * invalid. Set the compute pipeline to dirty to force a re-emit of the + * pipeline in case we get back-to-back dispatch calls with the same + * pipeline and a PIPELINE_SELECT in between. + */ + cmd_buffer->state.compute.pipeline_dirty = true; + } +#endif + + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: DEVSNB+ + * + * Software must ensure all the write caches are flushed through a + * stalling PIPE_CONTROL command followed by another PIPE_CONTROL + * command to invalidate read only caches prior to programming + * MI_PIPELINE_SELECT command to change the Pipeline Select Mode. + * + * Note the cmd_buffer_apply_pipe_flushes will split this into two + * PIPE_CONTROLs. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | + ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT | + ANV_PIPE_STATE_CACHE_INVALIDATE_BIT | + ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT | + ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, + "flush and invalidate for PIPELINE_SELECT"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) { +#if GFX_VER >= 9 + ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3; + ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12; +#endif + ps.PipelineSelection = pipeline; + } + +#if GFX_VER == 9 + if (devinfo->platform == INTEL_PLATFORM_GLK) { + /* Project: DevGLK + * + * "This chicken bit works around a hardware issue with barrier logic + * encountered when switching between GPGPU and 3D pipelines. To + * workaround the issue, this mode bit should be set after a pipeline + * is selected." 
+ */ + anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) { + scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU + : GLK_BARRIER_MODE_3D_HULL; + scec1.GLKBarrierModeMask = 1; + } + } +#endif + + cmd_buffer->state.current_pipeline = pipeline; +} + +void +genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer) +{ + genX(flush_pipeline_select)(cmd_buffer, _3D); +} + +void +genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer) +{ + genX(flush_pipeline_select)(cmd_buffer, GPGPU); +} + +void +genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) +{ + if (GFX_VER >= 8) + return; + + /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER: + * + * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any + * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, + * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first + * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit + * set), followed by a pipelined depth cache flush (PIPE_CONTROL with + * Depth Flush Bit set, followed by another pipelined depth stall + * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise + * guarantee that the pipeline from WM onwards is already flushed (e.g., + * via a preceding MI_FLUSH)." + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { + pipe.DepthStallEnable = true; + anv_debug_dump_pc(pipe); + } + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { + pipe.DepthCacheFlushEnable = true; +#if GFX_VER >= 12 + pipe.TileCacheFlushEnable = true; +#endif + anv_debug_dump_pc(pipe); + } + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { + pipe.DepthStallEnable = true; + anv_debug_dump_pc(pipe); + } +} + +void +genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer, + const struct isl_surf *surf) +{ +#if GFX_VERx10 == 120 + const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM && + surf->samples == 1; + + switch (cmd_buffer->state.depth_reg_mode) { + case ANV_DEPTH_REG_MODE_HW_DEFAULT: + if (!is_d16_1x_msaa) + return; + break; + case ANV_DEPTH_REG_MODE_D16_1X_MSAA: + if (is_d16_1x_msaa) + return; + break; + case ANV_DEPTH_REG_MODE_UNKNOWN: + break; + } + + /* We'll change some CHICKEN registers depending on the depth surface + * format. Do a depth flush and stall so the pipeline is not using these + * settings while we change the registers. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "Workaround: Stop pipeline for 14010455700"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* Wa_14010455700 + * + * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer + * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”. + */ + anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) { + reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa; + reg.HIZPlaneOptimizationdisablebitMask = true; + } + + cmd_buffer->state.depth_reg_mode = + is_d16_1x_msaa ? ANV_DEPTH_REG_MODE_D16_1X_MSAA : + ANV_DEPTH_REG_MODE_HW_DEFAULT; +#endif +} + +/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS: + * + * "The VF cache needs to be invalidated before binding and then using + * Vertex Buffers that overlap with any previously bound Vertex Buffer + * (at a 64B granularity) since the last invalidation. 
A VF cache + * invalidate is performed by setting the "VF Cache Invalidation Enable" + * bit in PIPE_CONTROL." + * + * This is implemented by carefully tracking all vertex and index buffer + * bindings and flushing if the cache ever ends up with a range in the cache + * that would exceed 4 GiB. This is implemented in three parts: + * + * 1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called + * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the + * tracking code of the new binding. If this new binding would cause + * the cache to have a too-large range on the next draw call, a pipeline + * stall and VF cache invalidate are added to pending_pipeline_bits. + * + * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to + * empty whenever we emit a VF invalidate. + * + * 3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called + * after every 3DPRIMITIVE and copies the bound range into the dirty + * range for each used buffer. This has to be a separate step because + * we don't always re-bind all buffers and so 1. can't know which + * buffers are actually bound. + */ +void +genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + int vb_index, + struct anv_address vb_address, + uint32_t vb_size) +{ + if (GFX_VER < 8 || GFX_VER > 9 || + anv_use_relocations(cmd_buffer->device->physical)) + return; + + struct anv_vb_cache_range *bound, *dirty; + if (vb_index == -1) { + bound = &cmd_buffer->state.gfx.ib_bound_range; + dirty = &cmd_buffer->state.gfx.ib_dirty_range; + } else { + assert(vb_index >= 0); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index]; + } + + if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty, + vb_address, + vb_size)) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_VF_CACHE_INVALIDATE_BIT, + "vb > 32b range"); + } +} + +void +genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type, + uint64_t vb_used) +{ + if (GFX_VER < 8 || GFX_VER > 9 || + anv_use_relocations(cmd_buffer->device->physical)) + return; + + if (access_type == RANDOM) { + /* We have an index buffer */ + struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range; + struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } + + uint64_t mask = vb_used; + while (mask) { + int i = u_bit_scan64(&mask); + assert(i >= 0); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + + struct anv_vb_cache_range *bound, *dirty; + bound = &cmd_buffer->state.gfx.vb_bound_ranges[i]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i]; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } +} + +/** + * Update the pixel hashing modes that determine the balancing of PS threads + * across subslices and slices. + * + * \param width Width bound of the rendering area (already scaled down if \p + * scale is greater than 1). 
+ * \param height Height bound of the rendering area (already scaled down if \p + * scale is greater than 1). + * \param scale The number of framebuffer samples that could potentially be + * affected by an individual channel of the PS thread. This is + * typically one for single-sampled rendering, but for operations + * like CCS resolves and fast clears a single PS invocation may + * update a huge number of pixels, in which case a finer + * balancing is desirable in order to maximally utilize the + * bandwidth available. UINT_MAX can be used as shorthand for + * "finest hashing mode available". + */ +void +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, + unsigned width, unsigned height, + unsigned scale) +{ +#if GFX_VER == 9 + const struct intel_device_info *devinfo = cmd_buffer->device->info; + const unsigned slice_hashing[] = { + /* Because all Gfx9 platforms with more than one slice require + * three-way subslice hashing, a single "normal" 16x16 slice hashing + * block is guaranteed to suffer from substantial imbalance, with one + * subslice receiving twice as much work as the other two in the + * slice. + * + * The performance impact of that would be particularly severe when + * three-way hashing is also in use for slice balancing (which is the + * case for all Gfx9 GT4 platforms), because one of the slices + * receives one every three 16x16 blocks in either direction, which + * is roughly the periodicity of the underlying subslice imbalance + * pattern ("roughly" because in reality the hardware's + * implementation of three-way hashing doesn't do exact modulo 3 + * arithmetic, which somewhat decreases the magnitude of this effect + * in practice). This leads to a systematic subslice imbalance + * within that slice regardless of the size of the primitive. The + * 32x32 hashing mode guarantees that the subslice imbalance within a + * single slice hashing block is minimal, largely eliminating this + * effect. + */ + _32x32, + /* Finest slice hashing mode available. */ + NORMAL + }; + const unsigned subslice_hashing[] = { + /* 16x16 would provide a slight cache locality benefit especially + * visible in the sampler L1 cache efficiency of low-bandwidth + * non-LLC platforms, but it comes at the cost of greater subslice + * imbalance for primitives of dimensions approximately intermediate + * between 16x4 and 16x16. + */ + _16x4, + /* Finest subslice hashing mode available. */ + _8x4 + }; + /* Dimensions of the smallest hashing block of a given hashing mode. If + * the rendering area is smaller than this there can't possibly be any + * benefit from switching to this mode, so we optimize out the + * transition. + */ + const unsigned min_size[][2] = { + { 16, 4 }, + { 8, 4 } + }; + const unsigned idx = scale > 1; + + if (cmd_buffer->state.current_hash_scale != scale && + (width > min_size[idx][0] || height > min_size[idx][1])) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "change pixel hash mode"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) { + gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0); + gt.SliceHashingMask = (devinfo->num_slices > 1 ? 
-1 : 0); + gt.SubsliceHashing = subslice_hashing[idx]; + gt.SubsliceHashingMask = -1; + } + + cmd_buffer->state.current_hash_scale = scale; + } +#endif +} + +static void +cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_device *device = cmd_buffer->device; + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + /* FIXME: Width and Height are wrong */ + + genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer); + + uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch, + device->isl_dev.ds.size / 4); + if (dw == NULL) + return; + + struct isl_view isl_view = {}; + struct isl_depth_stencil_hiz_emit_info info = { + .view = &isl_view, + .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT), + }; + + if (gfx->depth_att.iview != NULL) { + isl_view = gfx->depth_att.iview->planes[0].isl; + } else if (gfx->stencil_att.iview != NULL) { + isl_view = gfx->stencil_att.iview->planes[0].isl; + } + + if (gfx->view_mask) { + assert(isl_view.array_len == 0 || + isl_view.array_len >= util_last_bit(gfx->view_mask)); + isl_view.array_len = util_last_bit(gfx->view_mask); + } else { + assert(isl_view.array_len == 0 || + isl_view.array_len >= util_last_bit(gfx->layer_count)); + isl_view.array_len = gfx->layer_count; + } + + if (gfx->depth_att.iview != NULL) { + const struct anv_image_view *iview = gfx->depth_att.iview; + const struct anv_image *image = iview->image; + + const uint32_t depth_plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT); + const struct anv_surface *depth_surface = + &image->planes[depth_plane].primary_surface; + const struct anv_address depth_address = + anv_image_address(image, &depth_surface->memory_range); + + info.depth_surf = &depth_surface->isl; + + info.depth_address = + anv_batch_emit_reloc(&cmd_buffer->batch, + dw + device->isl_dev.ds.depth_offset / 4, + depth_address.bo, depth_address.offset); + info.mocs = + anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT); + + info.hiz_usage = gfx->depth_att.aux_usage; + if (info.hiz_usage != ISL_AUX_USAGE_NONE) { + assert(isl_aux_usage_has_hiz(info.hiz_usage)); + + const struct anv_surface *hiz_surface = + &image->planes[depth_plane].aux_surface; + const struct anv_address hiz_address = + anv_image_address(image, &hiz_surface->memory_range); + + info.hiz_surf = &hiz_surface->isl; + + info.hiz_address = + anv_batch_emit_reloc(&cmd_buffer->batch, + dw + device->isl_dev.ds.hiz_offset / 4, + hiz_address.bo, hiz_address.offset); + + info.depth_clear_value = ANV_HZ_FC_VAL; + } + } + + if (gfx->stencil_att.iview != NULL) { + const struct anv_image_view *iview = gfx->stencil_att.iview; + const struct anv_image *image = iview->image; + + const uint32_t stencil_plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); + const struct anv_surface *stencil_surface = + &image->planes[stencil_plane].primary_surface; + const struct anv_address stencil_address = + anv_image_address(image, &stencil_surface->memory_range); + + info.stencil_surf = &stencil_surface->isl; + + info.stencil_aux_usage = image->planes[stencil_plane].aux_usage; + info.stencil_address = + anv_batch_emit_reloc(&cmd_buffer->batch, + dw + device->isl_dev.ds.stencil_offset / 4, + stencil_address.bo, stencil_address.offset); + info.mocs = + anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT); + } + + isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info); + + if (info.depth_surf) + genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf); + + if (GFX_VER >= 12) { + 
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* Wa_1408224581 + * + * Workaround: Gfx12LP Astep only An additional pipe control with + * post-sync = store dword operation would be required.( w/a is to + * have an additional pipe control after the stencil state whenever + * the surface state bits of this state is changing). + * + * This also seems sufficient to handle Wa_14014148106. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.PostSyncOperation = WriteImmediateData; + pc.Address = cmd_buffer->device->workaround_address; + } + } + cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage); +} + +static void +cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image_view *fsr_iview) +{ +#if GFX_VERx10 >= 125 + struct anv_device *device = cmd_buffer->device; + + if (!device->vk.enabled_extensions.KHR_fragment_shading_rate) + return; + + uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch, + device->isl_dev.cpb.size / 4); + if (dw == NULL) + return; + + struct isl_cpb_emit_info info = { }; + + if (fsr_iview) { + info.view = &fsr_iview->planes[0].isl; + info.surf = &fsr_iview->image->planes[0].primary_surface.isl; + info.address = + anv_batch_emit_reloc(&cmd_buffer->batch, + dw + device->isl_dev.cpb.offset / 4, + fsr_iview->image->bindings[0].address.bo, + fsr_iview->image->bindings[0].address.offset + + fsr_iview->image->bindings[0].memory_range.offset); + info.mocs = + anv_mocs(device, fsr_iview->image->bindings[0].address.bo, + ISL_SURF_USAGE_CPB_BIT); + } + + isl_emit_cpb_control_s(&device->isl_dev, dw, &info); +#endif /* GFX_VERx10 >= 125 */ +} + +static VkImageLayout +attachment_initial_layout(const VkRenderingAttachmentInfo *att) +{ + const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info = + vk_find_struct_const(att->pNext, + RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA); + if (layout_info != NULL) + return layout_info->initialLayout; + + return att->imageLayout; +} + +void genX(CmdBeginRendering)( + VkCommandBuffer commandBuffer, + const VkRenderingInfo* pRenderingInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + VkResult result; + + if (!is_render_queue_cmd_buffer(cmd_buffer)) { + assert(!"Trying to start a render pass on non-render queue!"); + anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN); + return; + } + + anv_measure_beginrenderpass(cmd_buffer); + trace_intel_begin_render_pass(&cmd_buffer->trace); + + gfx->rendering_flags = pRenderingInfo->flags; + gfx->render_area = pRenderingInfo->renderArea; + gfx->view_mask = pRenderingInfo->viewMask; + gfx->layer_count = pRenderingInfo->layerCount; + gfx->samples = 0; + + const bool is_multiview = gfx->view_mask != 0; + const VkRect2D render_area = gfx->render_area; + const uint32_t layers = + is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count; + + /* The framebuffer size is at least large enough to contain the render + * area. Because a zero renderArea is possible, we MAX with 1. 
+ */ + struct isl_extent3d fb_size = { + .w = MAX2(1, render_area.offset.x + render_area.extent.width), + .h = MAX2(1, render_area.offset.y + render_area.extent.height), + .d = layers, + }; + + const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount; + result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count); + if (result != VK_SUCCESS) + return; + + genX(flush_pipeline_select_3d)(cmd_buffer); + + for (uint32_t i = 0; i < gfx->color_att_count; i++) { + if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE) + continue; + + const VkRenderingAttachmentInfo *att = + &pRenderingInfo->pColorAttachments[i]; + ANV_FROM_HANDLE(anv_image_view, iview, att->imageView); + const VkImageLayout initial_layout = attachment_initial_layout(att); + + assert(render_area.offset.x + render_area.extent.width <= + iview->vk.extent.width); + assert(render_area.offset.y + render_area.extent.height <= + iview->vk.extent.height); + assert(layers <= iview->vk.layer_count); + + fb_size.w = MAX2(fb_size.w, iview->vk.extent.width); + fb_size.h = MAX2(fb_size.h, iview->vk.extent.height); + + assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples); + gfx->samples |= iview->vk.image->samples; + + enum isl_aux_usage aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, + iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + att->imageLayout); + + union isl_color_value fast_clear_color = { .u32 = { 0, } }; + + if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR && + !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) { + const union isl_color_value clear_color = + vk_to_isl_color_with_format(att->clearValue.color, + iview->planes[0].isl.format); + + /* We only support fast-clears on the first layer */ + const bool fast_clear = + (!is_multiview || (gfx->view_mask & 1)) && + anv_can_fast_clear_color_view(cmd_buffer->device, iview, + att->imageLayout, clear_color, + layers, render_area); + + if (att->imageLayout != initial_layout) { + assert(render_area.offset.x == 0 && render_area.offset.y == 0 && + render_area.extent.width == iview->vk.extent.width && + render_area.extent.height == iview->vk.extent.height); + if (is_multiview) { + u_foreach_bit(view, gfx->view_mask) { + transition_color_buffer(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + iview->vk.base_mip_level, 1, + iview->vk.base_array_layer + view, + 1, /* layer_count */ + initial_layout, att->imageLayout, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + fast_clear); + } + } else { + transition_color_buffer(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + iview->vk.base_mip_level, 1, + iview->vk.base_array_layer, + gfx->layer_count, + initial_layout, att->imageLayout, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + fast_clear); + } + } + + uint32_t clear_view_mask = pRenderingInfo->viewMask; + uint32_t base_clear_layer = iview->vk.base_array_layer; + uint32_t clear_layer_count = gfx->layer_count; + if (fast_clear) { + /* We only support fast-clears on the first layer */ + assert(iview->vk.base_mip_level == 0 && + iview->vk.base_array_layer == 0); + + fast_clear_color = clear_color; + + if (iview->image->vk.samples == 1) { + anv_image_ccs_op(cmd_buffer, iview->image, + iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, + VK_IMAGE_ASPECT_COLOR_BIT, + 0, 0, 1, ISL_AUX_OP_FAST_CLEAR, + &fast_clear_color, + false); + } else { + anv_image_mcs_op(cmd_buffer, iview->image, + iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, + 
VK_IMAGE_ASPECT_COLOR_BIT, + 0, 1, ISL_AUX_OP_FAST_CLEAR, + &fast_clear_color, + false); + } + clear_view_mask &= ~1u; + base_clear_layer++; + clear_layer_count--; + + if (isl_color_value_is_zero(clear_color, + iview->planes[0].isl.format)) { + /* This image has the auxiliary buffer enabled. We can mark the + * subresource as not needing a resolve because the clear color + * will match what's in every RENDER_SURFACE_STATE object when + * it's being used for sampling. + */ + set_image_fast_clear_state(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + ANV_FAST_CLEAR_DEFAULT_VALUE); + } else { + set_image_fast_clear_state(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + ANV_FAST_CLEAR_ANY); + } + } + + if (is_multiview) { + u_foreach_bit(view, clear_view_mask) { + anv_image_clear_color(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + aux_usage, + iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, + iview->vk.base_mip_level, + iview->vk.base_array_layer + view, 1, + render_area, clear_color); + } + } else { + anv_image_clear_color(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + aux_usage, + iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, + iview->vk.base_mip_level, + base_clear_layer, clear_layer_count, + render_area, clear_color); + } + } else { + /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */ + assert(att->imageLayout == initial_layout); + } + + gfx->color_att[i].vk_format = iview->vk.format; + gfx->color_att[i].iview = iview; + gfx->color_att[i].layout = att->imageLayout; + gfx->color_att[i].aux_usage = aux_usage; + + struct isl_view isl_view = iview->planes[0].isl; + if (pRenderingInfo->viewMask) { + assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask)); + isl_view.array_len = util_last_bit(pRenderingInfo->viewMask); + } else { + assert(isl_view.array_len >= pRenderingInfo->layerCount); + isl_view.array_len = pRenderingInfo->layerCount; + } + + anv_image_fill_surface_state(cmd_buffer->device, + iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + &isl_view, + ISL_SURF_USAGE_RENDER_TARGET_BIT, + aux_usage, &fast_clear_color, + 0, /* anv_image_view_state_flags */ + &gfx->color_att[i].surface_state, + NULL); + + add_surface_state_relocs(cmd_buffer, gfx->color_att[i].surface_state); + + if (GFX_VER < 10 && + (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || + (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) && + iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE && + iview->planes[0].isl.base_level == 0 && + iview->planes[0].isl.base_array_layer == 0) { + genX(copy_fast_clear_dwords)(cmd_buffer, + gfx->color_att[i].surface_state.state, + iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + false /* copy to ss */); + } + + if (att->resolveMode != VK_RESOLVE_MODE_NONE) { + gfx->color_att[i].resolve_mode = att->resolveMode; + gfx->color_att[i].resolve_iview = + anv_image_view_from_handle(att->resolveImageView); + gfx->color_att[i].resolve_layout = att->resolveImageLayout; + } + } + + const struct anv_image_view *fsr_iview = NULL; + const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att = + vk_find_struct_const(pRenderingInfo->pNext, + RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR); + if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) { + fsr_iview = anv_image_view_from_handle(fsr_att->imageView); + /* imageLayout and shadingRateAttachmentTexelSize are ignored */ + } + + const struct anv_image_view *ds_iview = NULL; + const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment; + 
const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment; + if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) || + (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) { + const struct anv_image_view *d_iview = NULL, *s_iview = NULL; + VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED; + VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED; + VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED; + VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED; + enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE; + enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE; + float depth_clear_value = 0; + uint32_t stencil_clear_value = 0; + + if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) { + d_iview = anv_image_view_from_handle(d_att->imageView); + initial_depth_layout = attachment_initial_layout(d_att); + depth_layout = d_att->imageLayout; + depth_aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, + d_iview->image, + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + depth_layout); + depth_clear_value = d_att->clearValue.depthStencil.depth; + } + + if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) { + s_iview = anv_image_view_from_handle(s_att->imageView); + initial_stencil_layout = attachment_initial_layout(s_att); + stencil_layout = s_att->imageLayout; + stencil_aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, + s_iview->image, + VK_IMAGE_ASPECT_STENCIL_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + stencil_layout); + stencil_clear_value = s_att->clearValue.depthStencil.stencil; + } + + assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview); + ds_iview = d_iview != NULL ? d_iview : s_iview; + assert(ds_iview != NULL); + + assert(render_area.offset.x + render_area.extent.width <= + ds_iview->vk.extent.width); + assert(render_area.offset.y + render_area.extent.height <= + ds_iview->vk.extent.height); + assert(layers <= ds_iview->vk.layer_count); + + fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width); + fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height); + + assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples); + gfx->samples |= ds_iview->vk.image->samples; + + VkImageAspectFlags clear_aspects = 0; + if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR && + !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) + clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR && + !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) + clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + + if (clear_aspects != 0) { + const bool hiz_clear = + anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview, + depth_layout, clear_aspects, + depth_clear_value, + render_area); + + if (depth_layout != initial_depth_layout) { + assert(render_area.offset.x == 0 && render_area.offset.y == 0 && + render_area.extent.width == d_iview->vk.extent.width && + render_area.extent.height == d_iview->vk.extent.height); + + if (is_multiview) { + u_foreach_bit(view, gfx->view_mask) { + transition_depth_buffer(cmd_buffer, d_iview->image, + d_iview->vk.base_array_layer + view, + 1 /* layer_count */, + initial_depth_layout, depth_layout, + hiz_clear); + } + } else { + transition_depth_buffer(cmd_buffer, d_iview->image, + d_iview->vk.base_array_layer, + gfx->layer_count, + initial_depth_layout, depth_layout, + hiz_clear); + } + } + + if (stencil_layout != initial_stencil_layout) { + 
assert(render_area.offset.x == 0 && render_area.offset.y == 0 && + render_area.extent.width == s_iview->vk.extent.width && + render_area.extent.height == s_iview->vk.extent.height); + + if (is_multiview) { + u_foreach_bit(view, gfx->view_mask) { + transition_stencil_buffer(cmd_buffer, s_iview->image, + s_iview->vk.base_mip_level, 1, + s_iview->vk.base_array_layer + view, + 1 /* layer_count */, + initial_stencil_layout, + stencil_layout, + hiz_clear); + } + } else { + transition_stencil_buffer(cmd_buffer, s_iview->image, + s_iview->vk.base_mip_level, 1, + s_iview->vk.base_array_layer, + gfx->layer_count, + initial_stencil_layout, + stencil_layout, + hiz_clear); + } + } + + if (is_multiview) { + uint32_t clear_view_mask = pRenderingInfo->viewMask; + while (clear_view_mask) { + int view = u_bit_scan(&clear_view_mask); + + uint32_t level = ds_iview->vk.base_mip_level; + uint32_t layer = ds_iview->vk.base_array_layer + view; + + if (hiz_clear) { + anv_image_hiz_clear(cmd_buffer, ds_iview->image, + clear_aspects, + level, layer, 1, + render_area, + stencil_clear_value); + } else { + anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image, + clear_aspects, + depth_aux_usage, + level, layer, 1, + render_area, + depth_clear_value, + stencil_clear_value); + } + } + } else { + uint32_t level = ds_iview->vk.base_mip_level; + uint32_t base_layer = ds_iview->vk.base_array_layer; + uint32_t layer_count = gfx->layer_count; + + if (hiz_clear) { + anv_image_hiz_clear(cmd_buffer, ds_iview->image, + clear_aspects, + level, base_layer, layer_count, + render_area, + stencil_clear_value); + } else { + anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image, + clear_aspects, + depth_aux_usage, + level, base_layer, layer_count, + render_area, + depth_clear_value, + stencil_clear_value); + } + } + } else { + /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. 
*/ + assert(depth_layout == initial_depth_layout); + assert(stencil_layout == initial_stencil_layout); + } + + if (d_iview != NULL) { + gfx->depth_att.vk_format = d_iview->vk.format; + gfx->depth_att.iview = d_iview; + gfx->depth_att.layout = depth_layout; + gfx->depth_att.aux_usage = depth_aux_usage; + if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) { + assert(d_att->resolveImageView != VK_NULL_HANDLE); + gfx->depth_att.resolve_mode = d_att->resolveMode; + gfx->depth_att.resolve_iview = + anv_image_view_from_handle(d_att->resolveImageView); + gfx->depth_att.resolve_layout = d_att->resolveImageLayout; + } + } + + if (s_iview != NULL) { + gfx->stencil_att.vk_format = s_iview->vk.format; + gfx->stencil_att.iview = s_iview; + gfx->stencil_att.layout = stencil_layout; + gfx->stencil_att.aux_usage = stencil_aux_usage; + if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) { + assert(s_att->resolveImageView != VK_NULL_HANDLE); + gfx->stencil_att.resolve_mode = s_att->resolveMode; + gfx->stencil_att.resolve_iview = + anv_image_view_from_handle(s_att->resolveImageView); + gfx->stencil_att.resolve_layout = s_att->resolveImageLayout; + } + } + } + + /* Finally, now that we know the right size, set up the null surface */ + assert(util_bitcount(gfx->samples) <= 1); + isl_null_fill_state(&cmd_buffer->device->isl_dev, + gfx->null_surface_state.map, + .size = fb_size); + + for (uint32_t i = 0; i < gfx->color_att_count; i++) { + if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE) + continue; + + isl_null_fill_state(&cmd_buffer->device->isl_dev, + gfx->color_att[i].surface_state.state.map, + .size = fb_size); + } + + /****** We can now start emitting code to begin the render pass ******/ + + gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; + + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views. If the client asks for instancing, we need to use the + * Instance Data Step Rate to ensure that we repeat the client's + * per-instance data once for each view. Since this bit is in + * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top + * of each subpass. + */ + if (GFX_VER == 7) + gfx->vb_dirty |= ~0; + + /* It is possible to start a render pass with an old pipeline. Because the + * render pass and subpass index are both baked into the pipeline, this is + * highly unlikely. In order to do so, it requires that you have a render + * pass with a single subpass and that you use that render pass twice + * back-to-back and use the same pipeline at the start of the second render + * pass as at the end of the first. In order to avoid unpredictable issues + * with this edge case, we just dirty the pipeline at the start of every + * subpass. + */ + gfx->dirty |= ANV_CMD_DIRTY_PIPELINE; + +#if GFX_VER >= 11 + /* The PIPE_CONTROL command description says: + * + * "Whenever a Binding Table Index (BTI) used by a Render Target Message + * points to a different RENDER_SURFACE_STATE, SW must issue a Render + * Target Cache Flush by enabling this bit. When render target flush + * is set due to new association of BTI, PS Scoreboard Stall bit must + * be set in this packet." 
+ */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "change RT"); +#endif + + cmd_buffer_emit_depth_stencil(cmd_buffer); + + cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview); +} + +static void +cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer, + struct anv_attachment *att, + VkImageAspectFlagBits aspect) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct anv_image_view *iview = att->iview; + + if (gfx->view_mask == 0) { + genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image, + aspect, att->aux_usage, + iview->planes[0].isl.base_level, + iview->planes[0].isl.base_array_layer, + gfx->layer_count); + } else { + uint32_t res_view_mask = gfx->view_mask; + while (res_view_mask) { + int i = u_bit_scan(&res_view_mask); + + const uint32_t level = iview->planes[0].isl.base_level; + const uint32_t layer = iview->planes[0].isl.base_array_layer + i; + + genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image, + aspect, att->aux_usage, + level, layer, 1); + } + } +} + +static enum blorp_filter +vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode) +{ + switch (vk_mode) { + case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT: + return BLORP_FILTER_SAMPLE_0; + case VK_RESOLVE_MODE_AVERAGE_BIT: + return BLORP_FILTER_AVERAGE; + case VK_RESOLVE_MODE_MIN_BIT: + return BLORP_FILTER_MIN_SAMPLE; + case VK_RESOLVE_MODE_MAX_BIT: + return BLORP_FILTER_MAX_SAMPLE; + default: + return BLORP_FILTER_NONE; + } +} + +static void +cmd_buffer_resolve_msaa_attachment(struct anv_cmd_buffer *cmd_buffer, + const struct anv_attachment *att, + VkImageLayout layout, + VkImageAspectFlagBits aspect) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct anv_image_view *src_iview = att->iview; + const struct anv_image_view *dst_iview = att->resolve_iview; + + enum isl_aux_usage src_aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, + src_iview->image, aspect, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + layout); + + enum isl_aux_usage dst_aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, + dst_iview->image, aspect, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + att->resolve_layout); + + enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode); + + const VkRect2D render_area = gfx->render_area; + if (gfx->view_mask == 0) { + anv_image_msaa_resolve(cmd_buffer, + src_iview->image, src_aux_usage, + src_iview->planes[0].isl.base_level, + src_iview->planes[0].isl.base_array_layer, + dst_iview->image, dst_aux_usage, + dst_iview->planes[0].isl.base_level, + dst_iview->planes[0].isl.base_array_layer, + aspect, + render_area.offset.x, render_area.offset.y, + render_area.offset.x, render_area.offset.y, + render_area.extent.width, + render_area.extent.height, + gfx->layer_count, filter); + } else { + uint32_t res_view_mask = gfx->view_mask; + while (res_view_mask) { + int i = u_bit_scan(&res_view_mask); + + anv_image_msaa_resolve(cmd_buffer, + src_iview->image, src_aux_usage, + src_iview->planes[0].isl.base_level, + src_iview->planes[0].isl.base_array_layer + i, + dst_iview->image, dst_aux_usage, + dst_iview->planes[0].isl.base_level, + dst_iview->planes[0].isl.base_array_layer + i, + aspect, + render_area.offset.x, render_area.offset.y, + render_area.offset.x, render_area.offset.y, + render_area.extent.width, + render_area.extent.height, + 1, filter); + } + } +} + +void genX(CmdEndRendering)( + VkCommandBuffer commandBuffer) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, 
cmd_buffer, commandBuffer); + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + const bool is_multiview = gfx->view_mask != 0; + const uint32_t layers = + is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count; + + bool has_color_resolve = false; + for (uint32_t i = 0; i < gfx->color_att_count; i++) { + if (gfx->color_att[i].iview == NULL) + continue; + + cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i], + VK_IMAGE_ASPECT_COLOR_BIT); + + /* Stash this off for later */ + if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE && + !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) + has_color_resolve = true; + } + + if (gfx->depth_att.iview != NULL) { + cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att, + VK_IMAGE_ASPECT_DEPTH_BIT); + } + + if (gfx->stencil_att.iview != NULL) { + cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att, + VK_IMAGE_ASPECT_STENCIL_BIT); + } + + if (has_color_resolve) { + /* We are about to do some MSAA resolves. We need to flush so that the + * result of writes to the MSAA color attachments show up in the sampler + * when we blit to the single-sampled resolve target. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, + "MSAA resolve"); + } + + if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE || + gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) { + /* We are about to do some MSAA resolves. We need to flush so that the + * result of writes to the MSAA depth attachments show up in the sampler + * when we blit to the single-sampled resolve target. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT, + "MSAA resolve"); + } + + for (uint32_t i = 0; i < gfx->color_att_count; i++) { + const struct anv_attachment *att = &gfx->color_att[i]; + if (att->resolve_mode == VK_RESOLVE_MODE_NONE || + (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) + continue; + + cmd_buffer_resolve_msaa_attachment(cmd_buffer, att, att->layout, + VK_IMAGE_ASPECT_COLOR_BIT); + } + + if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE && + !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) { + const struct anv_image_view *src_iview = gfx->depth_att.iview; + + /* MSAA resolves sample from the source attachment. Transition the + * depth attachment first to get rid of any HiZ that we may not be + * able to handle. + */ + transition_depth_buffer(cmd_buffer, src_iview->image, + src_iview->planes[0].isl.base_array_layer, + layers, + gfx->depth_att.layout, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + false /* will_full_fast_clear */); + + cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->depth_att, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_IMAGE_ASPECT_DEPTH_BIT); + + /* Transition the source back to the original layout. This seems a bit + * inefficient but, since HiZ resolves aren't destructive, going from + * less HiZ to more is generally a no-op. 
+ */ + transition_depth_buffer(cmd_buffer, src_iview->image, + src_iview->planes[0].isl.base_array_layer, + layers, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + gfx->depth_att.layout, + false /* will_full_fast_clear */); + } + + if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE && + !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) { + cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->stencil_att, + gfx->stencil_att.layout, + VK_IMAGE_ASPECT_STENCIL_BIT); + } + +#if GFX_VER == 7 + /* On gfx7, we have to store a texturable version of the stencil buffer in + * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and + * forth at strategic points. Stencil writes are only allowed in following + * layouts: + * + * - VK_IMAGE_LAYOUT_GENERAL + * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT + * + * For general, we have no nice opportunity to transition so we do the copy + * to the shadow unconditionally at the end of the subpass. For transfer + * destinations, we can update it as part of the transfer op. For the other + * layouts, we delay the copy until a transition into some other layout. + */ + if (gfx->stencil_att.iview != NULL) { + const struct anv_image_view *iview = gfx->stencil_att.iview; + const struct anv_image *image = iview->image; + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); + + if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && + (gfx->stencil_att.layout == VK_IMAGE_LAYOUT_GENERAL || + gfx->stencil_att.layout == VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT)) { + anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + iview->planes[plane].isl.base_level, 1, + iview->planes[plane].isl.base_array_layer, + layers); + } + } +#endif + + anv_cmd_buffer_reset_rendering(cmd_buffer); +} + +void +genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer) +{ +#if GFX_VERx10 >= 75 + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), + mi_reg32(ANV_PREDICATE_RESULT_REG)); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } +#endif +} + +#if GFX_VERx10 >= 75 +void genX(CmdBeginConditionalRenderingEXT)( + VkCommandBuffer commandBuffer, + const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_address value_address = + anv_address_add(buffer->address, pConditionalRenderingBegin->offset); + + const bool isInverted = pConditionalRenderingBegin->flags & + VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; + + cmd_state->conditional_render_enabled = true; + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + /* Section 19.4 of the Vulkan 1.1.85 spec says: + * + * If the value of the predicate in buffer memory changes + * while conditional rendering is active, the rendering 
commands + * may be discarded in an implementation-dependent way. + * Some implementations may latch the value of the predicate + * upon beginning conditional rendering while others + * may read it before every rendering command. + * + * So it's perfectly fine to read a value from the buffer once. + */ + struct mi_value value = mi_mem32(value_address); + + /* Precompute predicate result, it is necessary to support secondary + * command buffers since it is unknown if conditional rendering is + * inverted when populating them. + */ + mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG), + isInverted ? mi_uge(&b, mi_imm(0), value) : + mi_ult(&b, mi_imm(0), value)); +} + +void genX(CmdEndConditionalRenderingEXT)( + VkCommandBuffer commandBuffer) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + + cmd_state->conditional_render_enabled = false; +} +#endif + +/* Set of stage bits for which are pipelined, i.e. they get queued + * by the command streamer for later execution. + */ +#define ANV_PIPELINE_STAGE_PIPELINED_BITS \ + ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \ + VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \ + VK_PIPELINE_STAGE_2_HOST_BIT | \ + VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT) + +void genX(CmdSetEvent2)( + VkCommandBuffer commandBuffer, + VkEvent _event, + const VkDependencyInfo* pDependencyInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_event, event, _event); + + VkPipelineStageFlags2 src_stages = 0; + + for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) + src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask; + for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) + src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask; + for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) + src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask; + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) { + pc.StallAtPixelScoreboard = true; + pc.CommandStreamerStallEnable = true; + } + + pc.DestinationAddressType = DAT_PPGTT, + pc.PostSyncOperation = WriteImmediateData, + pc.Address = (struct anv_address) { + cmd_buffer->device->dynamic_state_pool.block_pool.bo, + event->state.offset + }; + pc.ImmediateData = VK_EVENT_SET; + anv_debug_dump_pc(pc); + } +} + +void genX(CmdResetEvent2)( + VkCommandBuffer commandBuffer, + VkEvent _event, + VkPipelineStageFlags2 stageMask) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_event, event, _event); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) { + pc.StallAtPixelScoreboard = true; + pc.CommandStreamerStallEnable = true; + } + + pc.DestinationAddressType = DAT_PPGTT; + pc.PostSyncOperation = WriteImmediateData; + pc.Address = (struct anv_address) { + cmd_buffer->device->dynamic_state_pool.block_pool.bo, + event->state.offset + }; + pc.ImmediateData = VK_EVENT_RESET; + anv_debug_dump_pc(pc); + } +} + +void genX(CmdWaitEvents2)( + VkCommandBuffer commandBuffer, + uint32_t eventCount, + const VkEvent* pEvents, + const VkDependencyInfo* pDependencyInfos) +{ 
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + +#if GFX_VER >= 8 + for (uint32_t i = 0; i < eventCount; i++) { + ANV_FROM_HANDLE(anv_event, event, pEvents[i]); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) { + sem.WaitMode = PollingMode, + sem.CompareOperation = COMPARE_SAD_EQUAL_SDD, + sem.SemaphoreDataDword = VK_EVENT_SET, + sem.SemaphoreAddress = (struct anv_address) { + cmd_buffer->device->dynamic_state_pool.block_pool.bo, + event->state.offset + }; + } + } +#else + anv_finishme("Implement events on gfx7"); +#endif + + cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event"); +} + +static uint32_t vk_to_intel_index_type(VkIndexType type) +{ + switch (type) { + case VK_INDEX_TYPE_UINT8_EXT: + return INDEX_BYTE; + case VK_INDEX_TYPE_UINT16: + return INDEX_WORD; + case VK_INDEX_TYPE_UINT32: + return INDEX_DWORD; + default: + unreachable("invalid index type"); + } +} + +static uint32_t restart_index_for_type(VkIndexType type) +{ + switch (type) { + case VK_INDEX_TYPE_UINT8_EXT: + return UINT8_MAX; + case VK_INDEX_TYPE_UINT16: + return UINT16_MAX; + case VK_INDEX_TYPE_UINT32: + return UINT32_MAX; + default: + unreachable("invalid index type"); + } +} + +void genX(CmdBindIndexBuffer)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + VkIndexType indexType) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + + cmd_buffer->state.gfx.restart_index = restart_index_for_type(indexType); + cmd_buffer->state.gfx.index_buffer = buffer; + cmd_buffer->state.gfx.index_type = vk_to_intel_index_type(indexType); + cmd_buffer->state.gfx.index_offset = offset; + + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER; +} + +VkResult genX(CmdSetPerformanceOverrideINTEL)( + VkCommandBuffer commandBuffer, + const VkPerformanceOverrideInfoINTEL* pOverrideInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + switch (pOverrideInfo->type) { + case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: { +#if GFX_VER >= 9 + anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) { + csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable; + csdm2.MediaInstructionDisable = pOverrideInfo->enable; + csdm2._3DRenderingInstructionDisableMask = true; + csdm2.MediaInstructionDisableMask = true; + } +#else + anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) { + instpm._3DRenderingInstructionDisable = pOverrideInfo->enable; + instpm.MediaInstructionDisable = pOverrideInfo->enable; + instpm._3DRenderingInstructionDisableMask = true; + instpm.MediaInstructionDisableMask = true; + } +#endif + break; + } + + case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL: + if (pOverrideInfo->enable) { + /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_FLUSH_BITS | + ANV_PIPE_INVALIDATE_BITS, + "perf counter isolation"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + break; + + default: + unreachable("Invalid override"); + } + + return VK_SUCCESS; +} + +VkResult genX(CmdSetPerformanceStreamMarkerINTEL)( + VkCommandBuffer commandBuffer, + const VkPerformanceStreamMarkerInfoINTEL* pMarkerInfo) +{ + /* TODO: Waiting on the register to write, might depend on generation. 
*/ + + return VK_SUCCESS; +} + +#define TIMESTAMP 0x2358 + +void genX(cmd_emit_timestamp)(struct anv_batch *batch, + struct anv_device *device, + struct anv_address addr, + bool end_of_pipe) { + if (end_of_pipe) { + anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { + pc.PostSyncOperation = WriteTimestamp; + pc.Address = addr; + anv_debug_dump_pc(pc); + } + } else { + struct mi_builder b; + mi_builder_init(&b, device->info, batch); + mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP)); + } +} diff --git a/src/intel/vulkan_hasvk/genX_gpu_memcpy.c b/src/intel/vulkan_hasvk/genX_gpu_memcpy.c new file mode 100644 index 00000000000..3468137b0a8 --- /dev/null +++ b/src/intel/vulkan_hasvk/genX_gpu_memcpy.c @@ -0,0 +1,324 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" + +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" + +#include "common/intel_l3_config.h" + +/** + * This file implements some lightweight memcpy/memset operations on the GPU + * using a vertex buffer and streamout. + */ + +/** + * Returns the greatest common divisor of a and b that is a power of two. + */ +static uint64_t +gcd_pow2_u64(uint64_t a, uint64_t b) +{ + assert(a > 0 || b > 0); + + unsigned a_log2 = ffsll(a) - 1; + unsigned b_log2 = ffsll(b) - 1; + + /* If either a or b is 0, then a_log2 or b_log2 will be UINT_MAX in which + * case, the MIN2() will take the other one. If both are 0 then we will + * hit the assert above. 
+ */ + return 1 << MIN2(a_log2, b_log2); +} + +static void +emit_common_so_memcpy(struct anv_batch *batch, struct anv_device *device, + const struct intel_l3_config *l3_config) +{ +#if GFX_VER >= 8 + anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.InstancingEnable = false; + vfi.VertexElementIndex = 0; + } + anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs); +#endif + + /* Disable all shader stages */ + anv_batch_emit(batch, GENX(3DSTATE_VS), vs); + anv_batch_emit(batch, GENX(3DSTATE_HS), hs); + anv_batch_emit(batch, GENX(3DSTATE_TE), te); + anv_batch_emit(batch, GENX(3DSTATE_DS), DS); + anv_batch_emit(batch, GENX(3DSTATE_GS), gs); + anv_batch_emit(batch, GENX(3DSTATE_PS), gs); + + anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) { + sbe.VertexURBEntryReadOffset = 1; + sbe.NumberofSFOutputAttributes = 1; + sbe.VertexURBEntryReadLength = 1; +#if GFX_VER >= 8 + sbe.ForceVertexURBEntryReadLength = true; + sbe.ForceVertexURBEntryReadOffset = true; +#endif + +#if GFX_VER >= 9 + for (unsigned i = 0; i < 32; i++) + sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; +#endif + } + + /* Emit URB setup. We tell it that the VS is active because we want it to + * allocate space for the VS. Even though one isn't run, we need VUEs to + * store the data that VF is going to pass to SOL. + */ + const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 }; + + genX(emit_urb_setup)(device, batch, l3_config, + VK_SHADER_STAGE_VERTEX_BIT, entry_size, NULL); + +#if GFX_VER >= 12 + /* Disable Primitive Replication. */ + anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); +#endif + +#if GFX_VER >= 8 + anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) { + topo.PrimitiveTopologyType = _3DPRIM_POINTLIST; + } +#endif + + anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) { + vf.StatisticsEnable = false; + } +} + +static void +emit_so_memcpy(struct anv_batch *batch, struct anv_device *device, + struct anv_address dst, struct anv_address src, + uint32_t size) +{ + /* The maximum copy block size is 4 32-bit components at a time. */ + assert(size % 4 == 0); + unsigned bs = gcd_pow2_u64(16, size); + + enum isl_format format; + switch (bs) { + case 4: format = ISL_FORMAT_R32_UINT; break; + case 8: format = ISL_FORMAT_R32G32_UINT; break; + case 16: format = ISL_FORMAT_R32G32B32A32_UINT; break; + default: + unreachable("Invalid size"); + } + + uint32_t *dw; + dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_VERTEX_BUFFERS)); + GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1, + &(struct GENX(VERTEX_BUFFER_STATE)) { + .VertexBufferIndex = 32, /* Reserved for this */ + .AddressModifyEnable = true, + .BufferStartingAddress = src, + .BufferPitch = bs, + .MOCS = anv_mocs(device, src.bo, 0), +#if GFX_VER >= 12 + .L3BypassDisable = true, +#endif +#if (GFX_VER >= 8) + .BufferSize = size, +#else + .EndAddress = anv_address_add(src, size - 1), +#endif + }); + + dw = anv_batch_emitn(batch, 3, GENX(3DSTATE_VERTEX_ELEMENTS)); + GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw + 1, + &(struct GENX(VERTEX_ELEMENT_STATE)) { + .VertexBufferIndex = 32, + .Valid = true, + .SourceElementFormat = format, + .SourceElementOffset = 0, + .Component0Control = (bs >= 4) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, + .Component1Control = (bs >= 8) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, + .Component2Control = (bs >= 12) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, + .Component3Control = (bs >= 16) ? 
VFCOMP_STORE_SRC : VFCOMP_STORE_0, + }); + + + anv_batch_emit(batch, GENX(3DSTATE_SO_BUFFER), sob) { +#if GFX_VER < 12 + sob.SOBufferIndex = 0; +#else + sob._3DCommandOpcode = 0; + sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD; +#endif + sob.MOCS = anv_mocs(device, dst.bo, 0), + sob.SurfaceBaseAddress = dst; + +#if GFX_VER >= 8 + sob.SOBufferEnable = true; + sob.SurfaceSize = size / 4 - 1; +#else + sob.SurfacePitch = bs; + sob.SurfaceEndAddress = anv_address_add(dst, size); +#endif + +#if GFX_VER >= 8 + /* As SOL writes out data, it updates the SO_WRITE_OFFSET registers with + * the end position of the stream. We need to reset this value to 0 at + * the beginning of the run or else SOL will start at the offset from + * the previous draw. + */ + sob.StreamOffsetWriteEnable = true; + sob.StreamOffset = 0; +#endif + } + +#if GFX_VER <= 7 + /* The hardware can do this for us on BDW+ (see above) */ + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), load) { + load.RegisterOffset = GENX(SO_WRITE_OFFSET0_num); + load.DataDWord = 0; + } +#endif + + dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_SO_DECL_LIST), + .StreamtoBufferSelects0 = (1 << 0), + .NumEntries0 = 1); + GENX(SO_DECL_ENTRY_pack)(batch, dw + 3, + &(struct GENX(SO_DECL_ENTRY)) { + .Stream0Decl = { + .OutputBufferSlot = 0, + .RegisterIndex = 0, + .ComponentMask = (1 << (bs / 4)) - 1, + }, + }); + + anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so) { + so.SOFunctionEnable = true; + so.RenderingDisable = true; + so.Stream0VertexReadOffset = 0; + so.Stream0VertexReadLength = DIV_ROUND_UP(32, 64); +#if GFX_VER >= 8 + so.Buffer0SurfacePitch = bs; +#else + so.SOBufferEnable0 = true; +#endif + } + + anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) { + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = _3DPRIM_POINTLIST; + prim.VertexCountPerInstance = size / bs; + prim.StartVertexLocation = 0; + prim.InstanceCount = 1; + prim.StartInstanceLocation = 0; + prim.BaseVertexLocation = 0; + } +} + +void +genX(emit_so_memcpy_init)(struct anv_memcpy_state *state, + struct anv_device *device, + struct anv_batch *batch) +{ + memset(state, 0, sizeof(*state)); + + state->batch = batch; + state->device = device; + + const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info); + genX(emit_l3_config)(batch, device, cfg); + + anv_batch_emit(batch, GENX(PIPELINE_SELECT), ps) { +#if GFX_VER >= 9 + ps.MaskBits = GFX_VER >= 12 ? 
0x13 : 3; + ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12; +#endif + ps.PipelineSelection = _3D; + } + + emit_common_so_memcpy(batch, device, cfg); +} + +void +genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state) +{ + genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D, + ANV_PIPE_END_OF_PIPE_SYNC_BIT); + + anv_batch_emit(state->batch, GENX(MI_BATCH_BUFFER_END), end); + + if ((state->batch->next - state->batch->start) & 4) + anv_batch_emit(state->batch, GENX(MI_NOOP), noop); +} + +void +genX(emit_so_memcpy)(struct anv_memcpy_state *state, + struct anv_address dst, struct anv_address src, + uint32_t size) +{ + if (GFX_VER >= 8 && GFX_VER <= 9 && + !anv_use_relocations(state->device->physical) && + anv_gfx8_9_vb_cache_range_needs_workaround(&state->vb_bound, + &state->vb_dirty, + src, size)) { + genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_VF_CACHE_INVALIDATE_BIT); + memset(&state->vb_dirty, 0, sizeof(state->vb_dirty)); + } + + emit_so_memcpy(state->batch, state->device, dst, src, size); +} + +void +genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, + struct anv_address dst, struct anv_address src, + uint32_t size) +{ + if (size == 0) + return; + + if (!cmd_buffer->state.current_l3_config) { + const struct intel_l3_config *cfg = + intel_get_default_l3_config(cmd_buffer->device->info); + genX(cmd_buffer_config_l3)(cmd_buffer, cfg); + } + + genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, 32, src, size); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + genX(flush_pipeline_select_3d)(cmd_buffer); + + emit_common_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, + cmd_buffer->state.current_l3_config); + emit_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, dst, src, size); + + genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, SEQUENTIAL, + 1ull << 32); + + /* Invalidate pipeline & raster discard since we touch + * 3DSTATE_STREAMOUT. + */ + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; + BITSET_SET(cmd_buffer->vk.dynamic_graphics_state.dirty, + MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE); +} diff --git a/src/intel/vulkan_hasvk/genX_pipeline.c b/src/intel/vulkan_hasvk/genX_pipeline.c new file mode 100644 index 00000000000..a28f34a0efa --- /dev/null +++ b/src/intel/vulkan_hasvk/genX_pipeline.c @@ -0,0 +1,2563 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "anv_private.h" + +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" +#include "genxml/gen_rt_pack.h" + +#include "common/intel_l3_config.h" +#include "common/intel_sample_positions.h" +#include "nir/nir_xfb_info.h" +#include "vk_util.h" +#include "vk_format.h" +#include "vk_log.h" +#include "vk_render_pass.h" + +static uint32_t +vertex_element_comp_control(enum isl_format format, unsigned comp) +{ + uint8_t bits; + switch (comp) { + case 0: bits = isl_format_layouts[format].channels.r.bits; break; + case 1: bits = isl_format_layouts[format].channels.g.bits; break; + case 2: bits = isl_format_layouts[format].channels.b.bits; break; + case 3: bits = isl_format_layouts[format].channels.a.bits; break; + default: unreachable("Invalid component"); + } + + /* + * Take in account hardware restrictions when dealing with 64-bit floats. + * + * From Broadwell spec, command reference structures, page 586: + * "When SourceElementFormat is set to one of the *64*_PASSTHRU formats, + * 64-bit components are stored * in the URB without any conversion. In + * this case, vertex elements must be written as 128 or 256 bits, with + * VFCOMP_STORE_0 being used to pad the output as required. E.g., if + * R64_PASSTHRU is used to copy a 64-bit Red component into the URB, + * Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3 + * set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or + * Components 1-3 must be specified as VFCOMP_STORE_0 in order to output + * a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires + * Component 3 to be specified as VFCOMP_STORE_0 in order to output a + * 256-bit vertex element." + */ + if (bits) { + return VFCOMP_STORE_SRC; + } else if (comp >= 2 && + !isl_format_layouts[format].channels.b.bits && + isl_format_layouts[format].channels.r.type == ISL_RAW) { + /* When emitting 64-bit attributes, we need to write either 128 or 256 + * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and + * VFCOMP_STORE_0 to pad the written chunk */ + return VFCOMP_NOSTORE; + } else if (comp < 3 || + isl_format_layouts[format].channels.r.type == ISL_RAW) { + /* Note we need to pad with value 0, not 1, due hardware restrictions + * (see comment above) */ + return VFCOMP_STORE_0; + } else if (isl_format_layouts[format].channels.r.type == ISL_UINT || + isl_format_layouts[format].channels.r.type == ISL_SINT) { + assert(comp == 3); + return VFCOMP_STORE_1_INT; + } else { + assert(comp == 3); + return VFCOMP_STORE_1_FP; + } +} + +static void +emit_vertex_input(struct anv_graphics_pipeline *pipeline, + const struct vk_vertex_input_state *vi) +{ + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + /* Pull inputs_read out of the VS prog data */ + const uint64_t inputs_read = vs_prog_data->inputs_read; + const uint64_t double_inputs_read = + vs_prog_data->double_inputs_read & inputs_read; + assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0); + const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0; + const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0; + const bool needs_svgs_elem = vs_prog_data->uses_vertexid || + vs_prog_data->uses_instanceid || + vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance; + + uint32_t elem_count = __builtin_popcount(elements) - + __builtin_popcount(elements_double) / 2; + + const uint32_t total_elems = + MAX2(1, elem_count + needs_svgs_elem + vs_prog_data->uses_drawid); + + uint32_t *p; + + const uint32_t 
num_dwords = 1 + total_elems * 2; + p = anv_batch_emitn(&pipeline->base.batch, num_dwords, + GENX(3DSTATE_VERTEX_ELEMENTS)); + if (!p) + return; + + for (uint32_t i = 0; i < total_elems; i++) { + /* The SKL docs for VERTEX_ELEMENT_STATE say: + * + * "All elements must be valid from Element[0] to the last valid + * element. (I.e. if Element[2] is valid then Element[1] and + * Element[0] must also be valid)." + * + * The SKL docs for 3D_Vertex_Component_Control say: + * + * "Don't store this component. (Not valid for Component 0, but can + * be used for Component 1-3)." + * + * So we can't just leave a vertex element blank and hope for the best. + * We have to tell the VF hardware to put something in it; so we just + * store a bunch of zero. + * + * TODO: Compact vertex elements so we never end up with holes. + */ + struct GENX(VERTEX_ELEMENT_STATE) element = { + .Valid = true, + .Component0Control = VFCOMP_STORE_0, + .Component1Control = VFCOMP_STORE_0, + .Component2Control = VFCOMP_STORE_0, + .Component3Control = VFCOMP_STORE_0, + }; + GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + i * 2], &element); + } + + u_foreach_bit(a, vi->attributes_valid) { + enum isl_format format = anv_get_isl_format(pipeline->base.device->info, + vi->attributes[a].format, + VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_TILING_LINEAR); + + uint32_t binding = vi->attributes[a].binding; + assert(binding < MAX_VBS); + + if ((elements & (1 << a)) == 0) + continue; /* Binding unused */ + + uint32_t slot = + __builtin_popcount(elements & ((1 << a) - 1)) - + DIV_ROUND_UP(__builtin_popcount(elements_double & + ((1 << a) -1)), 2); + + struct GENX(VERTEX_ELEMENT_STATE) element = { + .VertexBufferIndex = vi->attributes[a].binding, + .Valid = true, + .SourceElementFormat = format, + .EdgeFlagEnable = false, + .SourceElementOffset = vi->attributes[a].offset, + .Component0Control = vertex_element_comp_control(format, 0), + .Component1Control = vertex_element_comp_control(format, 1), + .Component2Control = vertex_element_comp_control(format, 2), + .Component3Control = vertex_element_comp_control(format, 3), + }; + GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element); + +#if GFX_VER >= 8 + /* On Broadwell and later, we have a separate VF_INSTANCING packet + * that controls instancing. On Haswell and prior, that's part of + * VERTEX_BUFFER_STATE which we emit later. + */ + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + bool per_instance = pipeline->vb[binding].instanced; + uint32_t divisor = pipeline->vb[binding].instance_divisor * + pipeline->instance_multiplier; + + vfi.InstancingEnable = per_instance; + vfi.VertexElementIndex = slot; + vfi.InstanceDataStepRate = per_instance ? divisor : 1; + } +#endif + } + + const uint32_t id_slot = elem_count; + if (needs_svgs_elem) { + /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum: + * "Within a VERTEX_ELEMENT_STATE structure, if a Component + * Control field is set to something other than VFCOMP_STORE_SRC, + * no higher-numbered Component Control fields may be set to + * VFCOMP_STORE_SRC" + * + * This means, that if we have BaseInstance, we need BaseVertex as + * well. Just do all or nothing. + */ + uint32_t base_ctrl = (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) ? 
+ VFCOMP_STORE_SRC : VFCOMP_STORE_0; + + struct GENX(VERTEX_ELEMENT_STATE) element = { + .VertexBufferIndex = ANV_SVGS_VB_INDEX, + .Valid = true, + .SourceElementFormat = ISL_FORMAT_R32G32_UINT, + .Component0Control = base_ctrl, + .Component1Control = base_ctrl, +#if GFX_VER >= 8 + .Component2Control = VFCOMP_STORE_0, + .Component3Control = VFCOMP_STORE_0, +#else + .Component2Control = VFCOMP_STORE_VID, + .Component3Control = VFCOMP_STORE_IID, +#endif + }; + GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + id_slot * 2], &element); + +#if GFX_VER >= 8 + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.VertexElementIndex = id_slot; + } +#endif + } + +#if GFX_VER >= 8 + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_SGVS), sgvs) { + sgvs.VertexIDEnable = vs_prog_data->uses_vertexid; + sgvs.VertexIDComponentNumber = 2; + sgvs.VertexIDElementOffset = id_slot; + sgvs.InstanceIDEnable = vs_prog_data->uses_instanceid; + sgvs.InstanceIDComponentNumber = 3; + sgvs.InstanceIDElementOffset = id_slot; + } +#endif + + const uint32_t drawid_slot = elem_count + needs_svgs_elem; + if (vs_prog_data->uses_drawid) { + struct GENX(VERTEX_ELEMENT_STATE) element = { + .VertexBufferIndex = ANV_DRAWID_VB_INDEX, + .Valid = true, + .SourceElementFormat = ISL_FORMAT_R32_UINT, + .Component0Control = VFCOMP_STORE_SRC, + .Component1Control = VFCOMP_STORE_0, + .Component2Control = VFCOMP_STORE_0, + .Component3Control = VFCOMP_STORE_0, + }; + GENX(VERTEX_ELEMENT_STATE_pack)(NULL, + &p[1 + drawid_slot * 2], + &element); + +#if GFX_VER >= 8 + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.VertexElementIndex = drawid_slot; + } +#endif + } +} + +void +genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, + const struct intel_l3_config *l3_config, + VkShaderStageFlags active_stages, + const unsigned entry_size[4], + enum intel_urb_deref_block_size *deref_block_size) +{ + const struct intel_device_info *devinfo = device->info; + + unsigned entries[4]; + unsigned start[4]; + bool constrained; + intel_get_urb_config(devinfo, l3_config, + active_stages & + VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, + active_stages & VK_SHADER_STAGE_GEOMETRY_BIT, + entry_size, entries, start, deref_block_size, + &constrained); + +#if GFX_VERx10 == 70 + /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: + * + * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall + * needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS, + * 3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS, + * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one PIPE_CONTROL + * needs to be sent before any combination of VS associated 3DSTATE." 
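+    *
+    * The PIPE_CONTROL below implements exactly that: a depth stall
+    * combined with a post-sync immediate-data write to the device's
+    * workaround address.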
+ */ + anv_batch_emit(batch, GFX7_PIPE_CONTROL, pc) { + pc.DepthStallEnable = true; + pc.PostSyncOperation = WriteImmediateData; + pc.Address = device->workaround_address; + } +#endif + + for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { + anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + urb.VSURBStartingAddress = start[i]; + urb.VSURBEntryAllocationSize = entry_size[i] - 1; + urb.VSNumberofURBEntries = entries[i]; + } + } +#if GFX_VERx10 >= 125 + if (device->physical->vk.supported_extensions.NV_mesh_shader) { + anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero); + anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero); + } +#endif +} + +#if GFX_VERx10 >= 125 +static void +emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline, + enum intel_urb_deref_block_size *deref_block_size) +{ + const struct intel_device_info *devinfo = pipeline->base.device->info; + + const struct brw_task_prog_data *task_prog_data = + anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK) ? + get_task_prog_data(pipeline) : NULL; + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + + const struct intel_mesh_urb_allocation alloc = + intel_get_mesh_urb_config(devinfo, pipeline->base.l3_config, + task_prog_data ? task_prog_data->map.size_dw : 0, + mesh_prog_data->map.size_dw); + + /* Zero out the primitive pipeline URB allocations. */ + for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + } + } + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_URB_ALLOC_TASK), urb) { + if (task_prog_data) { + urb.TASKURBEntryAllocationSize = alloc.task_entry_size_64b - 1; + urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries; + urb.TASKNumberofURBEntriesSliceN = alloc.task_entries; + urb.TASKURBStartingAddressSlice0 = alloc.task_starting_address_8kb; + urb.TASKURBStartingAddressSliceN = alloc.task_starting_address_8kb; + } + } + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_URB_ALLOC_MESH), urb) { + urb.MESHURBEntryAllocationSize = alloc.mesh_entry_size_64b - 1; + urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries; + urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries; + urb.MESHURBStartingAddressSlice0 = alloc.mesh_starting_address_8kb; + urb.MESHURBStartingAddressSliceN = alloc.mesh_starting_address_8kb; + } + + *deref_block_size = alloc.deref_block_size; +} +#endif + +static void +emit_urb_setup(struct anv_graphics_pipeline *pipeline, + enum intel_urb_deref_block_size *deref_block_size) +{ +#if GFX_VERx10 >= 125 + if (anv_pipeline_is_mesh(pipeline)) { + emit_urb_setup_mesh(pipeline, deref_block_size); + return; + } +#endif + + unsigned entry_size[4]; + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { + const struct brw_vue_prog_data *prog_data = + !anv_pipeline_has_stage(pipeline, i) ? NULL : + (const struct brw_vue_prog_data *) pipeline->shaders[i]->prog_data; + + entry_size[i] = prog_data ? 
prog_data->urb_entry_size : 1; + } + + genX(emit_urb_setup)(pipeline->base.device, &pipeline->base.batch, + pipeline->base.l3_config, + pipeline->active_stages, entry_size, + deref_block_size); +} + +static void +emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline) +{ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE), sbe); +#if GFX_VER >= 8 + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ), sbe); +#endif +#if GFX_VERx10 >= 125 + if (anv_pipeline_is_mesh(pipeline)) + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_MESH), sbe_mesh); +#endif + return; + } + + struct GENX(3DSTATE_SBE) sbe = { + GENX(3DSTATE_SBE_header), + /* TODO(mesh): Figure out cases where we need attribute swizzling. See also + * calculate_urb_setup() and related functions. + */ + .AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline), + .PointSpriteTextureCoordinateOrigin = UPPERLEFT, + .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs, + .ConstantInterpolationEnable = wm_prog_data->flat_inputs, + }; + +#if GFX_VER >= 9 + for (unsigned i = 0; i < 32; i++) + sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; +#endif + +#if GFX_VER >= 8 + /* On Broadwell, they broke 3DSTATE_SBE into two packets */ + struct GENX(3DSTATE_SBE_SWIZ) swiz = { + GENX(3DSTATE_SBE_SWIZ_header), + }; +#else +# define swiz sbe +#endif + + if (anv_pipeline_is_primitive(pipeline)) { + const struct brw_vue_map *fs_input_map = + &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map; + + int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs, + fs_input_map); + assert(first_slot % 2 == 0); + unsigned urb_entry_read_offset = first_slot / 2; + int max_source_attr = 0; + for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) { + uint8_t attr = wm_prog_data->urb_setup_attribs[idx]; + int input_index = wm_prog_data->urb_setup[attr]; + + assert(0 <= input_index); + + /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the + * VUE header + */ + if (attr == VARYING_SLOT_VIEWPORT || + attr == VARYING_SLOT_LAYER || + attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) { + continue; + } + + if (attr == VARYING_SLOT_PNTC) { + sbe.PointSpriteTextureCoordinateEnable = 1 << input_index; + continue; + } + + const int slot = fs_input_map->varying_to_slot[attr]; + + if (slot == -1) { + /* This attribute does not exist in the VUE--that means that the + * vertex shader did not write to it. It could be that it's a + * regular varying read by the fragment shader but not written by + * the vertex shader or it's gl_PrimitiveID. In the first case the + * value is undefined, in the second it needs to be + * gl_PrimitiveID. + */ + swiz.Attribute[input_index].ConstantSource = PRIM_ID; + swiz.Attribute[input_index].ComponentOverrideX = true; + swiz.Attribute[input_index].ComponentOverrideY = true; + swiz.Attribute[input_index].ComponentOverrideZ = true; + swiz.Attribute[input_index].ComponentOverrideW = true; + continue; + } + + /* We have to subtract two slots to account for the URB entry output + * read offset in the VS and GS stages. 
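+          *
+          * For example, with urb_entry_read_offset = 1 (attribute reads
+          * start two slots into the VUE), a varying living in VUE slot 2
+          * ends up as source attribute 2 - 2 * 1 = 0.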
+ */ + const int source_attr = slot - 2 * urb_entry_read_offset; + assert(source_attr >= 0 && source_attr < 32); + max_source_attr = MAX2(max_source_attr, source_attr); + /* The hardware can only do overrides on 16 overrides at a time, and the + * other up to 16 have to be lined up so that the input index = the + * output index. We'll need to do some tweaking to make sure that's the + * case. + */ + if (input_index < 16) + swiz.Attribute[input_index].SourceAttribute = source_attr; + else + assert(source_attr == input_index); + } + + sbe.VertexURBEntryReadOffset = urb_entry_read_offset; + sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2); +#if GFX_VER >= 8 + sbe.ForceVertexURBEntryReadOffset = true; + sbe.ForceVertexURBEntryReadLength = true; +#endif + } else { + assert(anv_pipeline_is_mesh(pipeline)); +#if GFX_VERx10 >= 125 + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_MESH), sbe_mesh) { + const struct brw_mue_map *mue = &mesh_prog_data->map; + + assert(mue->per_vertex_header_size_dw % 8 == 0); + sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8; + sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8); + + /* Clip distance array is passed in the per-vertex header so that + * it can be consumed by the HW. If user wants to read it in the FS, + * adjust the offset and length to cover it. Conveniently it is at + * the end of the per-vertex header, right before per-vertex + * attributes. + * + * Note that FS attribute reading must be aware that the clip + * distances have fixed position. + */ + if (mue->per_vertex_header_size_dw > 8 && + (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 || + wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) { + sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1; + sbe_mesh.PerVertexURBEntryOutputReadLength += 1; + } + + assert(mue->per_primitive_header_size_dw % 8 == 0); + sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = mue->per_primitive_header_size_dw / 8; + sbe_mesh.PerPrimitiveURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8); + + /* Just like with clip distances, if Primitive Shading Rate, + * Viewport Index or Layer is read back in the FS, adjust + * the offset and length to cover the Primitive Header, where + * PSR, Viewport Index & Layer are stored. + */ + if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 || + wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 || + wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0) { + assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0); + sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1; + sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1; + } + } +#endif + } + + uint32_t *dw = anv_batch_emit_dwords(&pipeline->base.batch, + GENX(3DSTATE_SBE_length)); + if (!dw) + return; + GENX(3DSTATE_SBE_pack)(&pipeline->base.batch, dw, &sbe); + +#if GFX_VER >= 8 + dw = anv_batch_emit_dwords(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ_length)); + if (!dw) + return; + GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->base.batch, dw, &swiz); +#endif +} + +/** Returns the final polygon mode for rasterization + * + * This function takes into account polygon mode, primitive topology and the + * different shader stages which might generate their own type of primitives. 
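+ *
+ * For example, a pipeline whose geometry shader emits a line strip
+ * always rasterizes as VK_POLYGON_MODE_LINE, regardless of the polygon
+ * mode requested through the API.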
+ */ +VkPolygonMode +genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline, + VkPrimitiveTopology primitive_topology) +{ + if (anv_pipeline_is_mesh(pipeline)) { + switch (get_mesh_prog_data(pipeline)->primitive_type) { + case SHADER_PRIM_POINTS: + return VK_POLYGON_MODE_POINT; + case SHADER_PRIM_LINES: + return VK_POLYGON_MODE_LINE; + case SHADER_PRIM_TRIANGLES: + return pipeline->polygon_mode; + default: + unreachable("invalid primitive type for mesh"); + } + } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) { + switch (get_gs_prog_data(pipeline)->output_topology) { + case _3DPRIM_POINTLIST: + return VK_POLYGON_MODE_POINT; + + case _3DPRIM_LINELIST: + case _3DPRIM_LINESTRIP: + case _3DPRIM_LINELOOP: + return VK_POLYGON_MODE_LINE; + + case _3DPRIM_TRILIST: + case _3DPRIM_TRIFAN: + case _3DPRIM_TRISTRIP: + case _3DPRIM_RECTLIST: + case _3DPRIM_QUADLIST: + case _3DPRIM_QUADSTRIP: + case _3DPRIM_POLYGON: + return pipeline->polygon_mode; + } + unreachable("Unsupported GS output topology"); + } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) { + switch (get_tes_prog_data(pipeline)->output_topology) { + case BRW_TESS_OUTPUT_TOPOLOGY_POINT: + return VK_POLYGON_MODE_POINT; + + case BRW_TESS_OUTPUT_TOPOLOGY_LINE: + return VK_POLYGON_MODE_LINE; + + case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW: + case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW: + return pipeline->polygon_mode; + } + unreachable("Unsupported TCS output topology"); + } else { + switch (primitive_topology) { + case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: + return VK_POLYGON_MODE_POINT; + + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: + return VK_POLYGON_MODE_LINE; + + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: + return pipeline->polygon_mode; + + default: + unreachable("Unsupported primitive topology"); + } + } +} + +uint32_t +genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline, + VkPolygonMode raster_mode) +{ +#if GFX_VER <= 7 + if (raster_mode == VK_POLYGON_MODE_LINE) { + switch (pipeline->line_mode) { + case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT: + return MSRASTMODE_ON_PATTERN; + + case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT: + case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT: + return MSRASTMODE_OFF_PIXEL; + + default: + unreachable("Unsupported line rasterization mode"); + } + } else { + return pipeline->rasterization_samples > 1 ? 
+ MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL; + } +#else + unreachable("Only on gen7"); +#endif +} + +const uint32_t genX(vk_to_intel_cullmode)[] = { + [VK_CULL_MODE_NONE] = CULLMODE_NONE, + [VK_CULL_MODE_FRONT_BIT] = CULLMODE_FRONT, + [VK_CULL_MODE_BACK_BIT] = CULLMODE_BACK, + [VK_CULL_MODE_FRONT_AND_BACK] = CULLMODE_BOTH +}; + +const uint32_t genX(vk_to_intel_fillmode)[] = { + [VK_POLYGON_MODE_FILL] = FILL_MODE_SOLID, + [VK_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME, + [VK_POLYGON_MODE_POINT] = FILL_MODE_POINT, +}; + +const uint32_t genX(vk_to_intel_front_face)[] = { + [VK_FRONT_FACE_COUNTER_CLOCKWISE] = 1, + [VK_FRONT_FACE_CLOCKWISE] = 0 +}; + +void +genX(rasterization_mode)(VkPolygonMode raster_mode, + VkLineRasterizationModeEXT line_mode, + float line_width, + uint32_t *api_mode, + bool *msaa_rasterization_enable) +{ +#if GFX_VER >= 8 + if (raster_mode == VK_POLYGON_MODE_LINE) { + /* Unfortunately, configuring our line rasterization hardware on gfx8 + * and later is rather painful. Instead of giving us bits to tell the + * hardware what line mode to use like we had on gfx7, we now have an + * arcane combination of API Mode and MSAA enable bits which do things + * in a table which are expected to magically put the hardware into the + * right mode for your API. Sadly, Vulkan isn't any of the APIs the + * hardware people thought of so nothing works the way you want it to. + * + * Look at the table titled "Multisample Rasterization Modes" in Vol 7 + * of the Skylake PRM for more details. + */ + switch (line_mode) { + case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT: + *api_mode = DX100; +#if GFX_VER <= 9 + /* Prior to ICL, the algorithm the HW uses to draw wide lines + * doesn't quite match what the CTS expects, at least for rectangular + * lines, so we set this to false here, making it draw parallelograms + * instead, which work well enough. 
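+          *
+          * Note that the 1.0078125 cutoff used below is exactly 1 + 1/128:
+          * only lines narrower than that keep MSAA rasterization enabled.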
+ */ + *msaa_rasterization_enable = line_width < 1.0078125; +#else + *msaa_rasterization_enable = true; +#endif + break; + + case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT: + case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT: + *api_mode = DX9OGL; + *msaa_rasterization_enable = false; + break; + + default: + unreachable("Unsupported line rasterization mode"); + } + } else { + *api_mode = DX100; + *msaa_rasterization_enable = true; + } +#else + unreachable("Invalid call"); +#endif +} + +static void +emit_rs_state(struct anv_graphics_pipeline *pipeline, + const struct vk_input_assembly_state *ia, + const struct vk_rasterization_state *rs, + const struct vk_multisample_state *ms, + const struct vk_render_pass_state *rp, + enum intel_urb_deref_block_size urb_deref_block_size) +{ + struct GENX(3DSTATE_SF) sf = { + GENX(3DSTATE_SF_header), + }; + + sf.ViewportTransformEnable = true; + sf.StatisticsEnable = true; + sf.VertexSubPixelPrecisionSelect = _8Bit; + sf.AALineDistanceMode = true; + + switch (rs->provoking_vertex) { + case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: + sf.TriangleStripListProvokingVertexSelect = 0; + sf.LineStripListProvokingVertexSelect = 0; + sf.TriangleFanProvokingVertexSelect = 1; + break; + + case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: + sf.TriangleStripListProvokingVertexSelect = 2; + sf.LineStripListProvokingVertexSelect = 1; + sf.TriangleFanProvokingVertexSelect = 2; + break; + + default: + unreachable("Invalid provoking vertex mode"); + } + +#if GFX_VERx10 == 75 + sf.LineStippleEnable = rs->line.stipple.enable; +#endif + +#if GFX_VER >= 12 + sf.DerefBlockSize = urb_deref_block_size; +#endif + + bool point_from_shader; + if (anv_pipeline_is_primitive(pipeline)) { + const struct brw_vue_prog_data *last_vue_prog_data = + anv_pipeline_get_last_vue_prog_data(pipeline); + point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ; + } else { + assert(anv_pipeline_is_mesh(pipeline)); + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0; + } + + if (point_from_shader) { + sf.PointWidthSource = Vertex; + } else { + sf.PointWidthSource = State; + sf.PointWidth = 1.0; + } + +#if GFX_VER >= 8 + struct GENX(3DSTATE_RASTER) raster = { + GENX(3DSTATE_RASTER_header), + }; +#else +# define raster sf +#endif + + /* For details on 3DSTATE_RASTER multisample state, see the BSpec table + * "Multisample Modes State". + */ +#if GFX_VER >= 8 + /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix + * computations. If we ever set this bit to a different value, they will + * need to be updated accordingly. 
+ */ + raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0; + raster.ForceMultisampling = false; +#endif + + raster.FrontFaceFillMode = genX(vk_to_intel_fillmode)[rs->polygon_mode]; + raster.BackFaceFillMode = genX(vk_to_intel_fillmode)[rs->polygon_mode]; + raster.ScissorRectangleEnable = true; + +#if GFX_VER >= 9 + /* GFX9+ splits ViewportZClipTestEnable into near and far enable bits */ + raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable; + raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable; +#elif GFX_VER >= 8 + raster.ViewportZClipTestEnable = pipeline->depth_clip_enable; +#endif + +#if GFX_VER >= 9 + raster.ConservativeRasterizationEnable = + rs->conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT; +#endif + +#if GFX_VER == 7 + /* Gfx7 requires that we provide the depth format in 3DSTATE_SF so that it + * can get the depth offsets correct. + */ + if (rp != NULL && + rp->depth_attachment_format != VK_FORMAT_UNDEFINED) { + assert(vk_format_has_depth(rp->depth_attachment_format)); + enum isl_format isl_format = + anv_get_isl_format(pipeline->base.device->info, + rp->depth_attachment_format, + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_TILING_OPTIMAL); + sf.DepthBufferSurfaceFormat = + isl_format_get_depth_format(isl_format, false); + } +#endif + +#if GFX_VER >= 8 + GENX(3DSTATE_SF_pack)(NULL, pipeline->gfx8.sf, &sf); + GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gfx8.raster, &raster); +#else +# undef raster + GENX(3DSTATE_SF_pack)(NULL, &pipeline->gfx7.sf, &sf); +#endif +} + +static void +emit_ms_state(struct anv_graphics_pipeline *pipeline, + const struct vk_multisample_state *ms) +{ +#if GFX_VER >= 8 + /* On Gfx8+ 3DSTATE_MULTISAMPLE only holds the number of samples. */ + genX(emit_multisample)(&pipeline->base.batch, + pipeline->rasterization_samples, + NULL); +#endif + + /* From the Vulkan 1.0 spec: + * If pSampleMask is NULL, it is treated as if the mask has all bits + * enabled, i.e. no coverage is removed from fragments. + * + * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits. 
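+    *
+    * Hence the all-ones defaults below: 0xffff on gfx8+, and 0xff on
+    * gfx7, which appears to only expose an 8-bit mask field.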
+ */ +#if GFX_VER >= 8 + uint32_t sample_mask = 0xffff; +#else + uint32_t sample_mask = 0xff; +#endif + + if (ms != NULL) + sample_mask &= ms->sample_mask; + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SAMPLE_MASK), sm) { + sm.SampleMask = sample_mask; + } +} + +const uint32_t genX(vk_to_intel_logic_op)[] = { + [VK_LOGIC_OP_COPY] = LOGICOP_COPY, + [VK_LOGIC_OP_CLEAR] = LOGICOP_CLEAR, + [VK_LOGIC_OP_AND] = LOGICOP_AND, + [VK_LOGIC_OP_AND_REVERSE] = LOGICOP_AND_REVERSE, + [VK_LOGIC_OP_AND_INVERTED] = LOGICOP_AND_INVERTED, + [VK_LOGIC_OP_NO_OP] = LOGICOP_NOOP, + [VK_LOGIC_OP_XOR] = LOGICOP_XOR, + [VK_LOGIC_OP_OR] = LOGICOP_OR, + [VK_LOGIC_OP_NOR] = LOGICOP_NOR, + [VK_LOGIC_OP_EQUIVALENT] = LOGICOP_EQUIV, + [VK_LOGIC_OP_INVERT] = LOGICOP_INVERT, + [VK_LOGIC_OP_OR_REVERSE] = LOGICOP_OR_REVERSE, + [VK_LOGIC_OP_COPY_INVERTED] = LOGICOP_COPY_INVERTED, + [VK_LOGIC_OP_OR_INVERTED] = LOGICOP_OR_INVERTED, + [VK_LOGIC_OP_NAND] = LOGICOP_NAND, + [VK_LOGIC_OP_SET] = LOGICOP_SET, +}; + +static const uint32_t vk_to_intel_blend[] = { + [VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO, + [VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE, + [VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR, + [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR, + [VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR, + [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR, + [VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA, + [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA, + [VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA, + [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA, + [VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR, + [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR, + [VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA, + [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA, + [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE, + [VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR, + [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR, + [VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA, + [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA, +}; + +static const uint32_t vk_to_intel_blend_op[] = { + [VK_BLEND_OP_ADD] = BLENDFUNCTION_ADD, + [VK_BLEND_OP_SUBTRACT] = BLENDFUNCTION_SUBTRACT, + [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT, + [VK_BLEND_OP_MIN] = BLENDFUNCTION_MIN, + [VK_BLEND_OP_MAX] = BLENDFUNCTION_MAX, +}; + +const uint32_t genX(vk_to_intel_compare_op)[] = { + [VK_COMPARE_OP_NEVER] = PREFILTEROP_NEVER, + [VK_COMPARE_OP_LESS] = PREFILTEROP_LESS, + [VK_COMPARE_OP_EQUAL] = PREFILTEROP_EQUAL, + [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROP_LEQUAL, + [VK_COMPARE_OP_GREATER] = PREFILTEROP_GREATER, + [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROP_NOTEQUAL, + [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GEQUAL, + [VK_COMPARE_OP_ALWAYS] = PREFILTEROP_ALWAYS, +}; + +const uint32_t genX(vk_to_intel_stencil_op)[] = { + [VK_STENCIL_OP_KEEP] = STENCILOP_KEEP, + [VK_STENCIL_OP_ZERO] = STENCILOP_ZERO, + [VK_STENCIL_OP_REPLACE] = STENCILOP_REPLACE, + [VK_STENCIL_OP_INCREMENT_AND_CLAMP] = STENCILOP_INCRSAT, + [VK_STENCIL_OP_DECREMENT_AND_CLAMP] = STENCILOP_DECRSAT, + [VK_STENCIL_OP_INVERT] = STENCILOP_INVERT, + [VK_STENCIL_OP_INCREMENT_AND_WRAP] = STENCILOP_INCR, + [VK_STENCIL_OP_DECREMENT_AND_WRAP] = STENCILOP_DECR, +}; + +const uint32_t genX(vk_to_intel_primitive_type)[] = { + [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = _3DPRIM_POINTLIST, + 
[VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = _3DPRIM_LINELIST, + [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = _3DPRIM_LINESTRIP, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = _3DPRIM_TRILIST, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = _3DPRIM_TRIFAN, + [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = _3DPRIM_LINELIST_ADJ, + [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = _3DPRIM_TRILIST_ADJ, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ, +}; + +static bool +is_dual_src_blend_factor(VkBlendFactor factor) +{ + return factor == VK_BLEND_FACTOR_SRC1_COLOR || + factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR || + factor == VK_BLEND_FACTOR_SRC1_ALPHA || + factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA; +} + +static inline uint32_t * +write_disabled_blend(uint32_t *state) +{ + struct GENX(BLEND_STATE_ENTRY) entry = { + .WriteDisableAlpha = true, + .WriteDisableRed = true, + .WriteDisableGreen = true, + .WriteDisableBlue = true, + }; + GENX(BLEND_STATE_ENTRY_pack)(NULL, state, &entry); + return state + GENX(BLEND_STATE_ENTRY_length); +} + +static void +emit_cb_state(struct anv_graphics_pipeline *pipeline, + const struct vk_color_blend_state *cb, + const struct vk_multisample_state *ms) +{ + struct anv_device *device = pipeline->base.device; + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + + struct GENX(BLEND_STATE) blend_state = { +#if GFX_VER >= 8 + .AlphaToCoverageEnable = ms && ms->alpha_to_coverage_enable, + .AlphaToOneEnable = ms && ms->alpha_to_one_enable, +#endif + }; + + uint32_t surface_count = 0; + struct anv_pipeline_bind_map *map; + if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map; + surface_count = map->surface_count; + } + + const struct intel_device_info *devinfo = pipeline->base.device->info; + uint32_t *blend_state_start = devinfo->ver >= 8 ? + pipeline->gfx8.blend_state : pipeline->gfx7.blend_state; + uint32_t *state_pos = blend_state_start; + + state_pos += GENX(BLEND_STATE_length); +#if GFX_VER >= 8 + struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 }; +#endif + for (unsigned i = 0; i < surface_count; i++) { + struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i]; + + /* All color attachments are at the beginning of the binding table */ + if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) + break; + + /* We can have at most 8 attachments */ + assert(i < MAX_RTS); + + if (cb == NULL || binding->index >= cb->attachment_count) { + state_pos = write_disabled_blend(state_pos); + continue; + } + + const struct vk_color_blend_attachment_state *a = + &cb->attachments[binding->index]; + + struct GENX(BLEND_STATE_ENTRY) entry = { +#if GFX_VER < 8 + .AlphaToCoverageEnable = ms && ms->alpha_to_coverage_enable, + .AlphaToOneEnable = ms && ms->alpha_to_one_enable, +#endif + .LogicOpEnable = cb->logic_op_enable, + + /* Vulkan specification 1.2.168, VkLogicOp: + * + * "Logical operations are controlled by the logicOpEnable and + * logicOp members of VkPipelineColorBlendStateCreateInfo. If + * logicOpEnable is VK_TRUE, then a logical operation selected by + * logicOp is applied between each color attachment and the + * fragment’s corresponding output value, and blending of all + * attachments is treated as if it were disabled." 
+ * + * From the Broadwell PRM Volume 2d: Command Reference: Structures: + * BLEND_STATE_ENTRY: + * + * "Enabling LogicOp and Color Buffer Blending at the same time is + * UNDEFINED" + */ + .ColorBufferBlendEnable = !cb->logic_op_enable && a->blend_enable, + .ColorClampRange = COLORCLAMP_RTFORMAT, + .PreBlendColorClampEnable = true, + .PostBlendColorClampEnable = true, + .SourceBlendFactor = vk_to_intel_blend[a->src_color_blend_factor], + .DestinationBlendFactor = vk_to_intel_blend[a->dst_color_blend_factor], + .ColorBlendFunction = vk_to_intel_blend_op[a->color_blend_op], + .SourceAlphaBlendFactor = vk_to_intel_blend[a->src_alpha_blend_factor], + .DestinationAlphaBlendFactor = vk_to_intel_blend[a->dst_alpha_blend_factor], + .AlphaBlendFunction = vk_to_intel_blend_op[a->alpha_blend_op], + }; + + if (a->src_color_blend_factor != a->src_alpha_blend_factor || + a->dst_color_blend_factor != a->dst_alpha_blend_factor || + a->color_blend_op != a->alpha_blend_op) { +#if GFX_VER >= 8 + blend_state.IndependentAlphaBlendEnable = true; +#else + entry.IndependentAlphaBlendEnable = true; +#endif + } + + /* The Dual Source Blending documentation says: + * + * "If SRC1 is included in a src/dst blend factor and + * a DualSource RT Write message is not used, results + * are UNDEFINED. (This reflects the same restriction in DX APIs, + * where undefined results are produced if “o1” is not written + * by a PS – there are no default values defined)." + * + * There is no way to gracefully fix this undefined situation + * so we just disable the blending to prevent possible issues. + */ + if (!wm_prog_data->dual_src_blend && + (is_dual_src_blend_factor(a->src_color_blend_factor) || + is_dual_src_blend_factor(a->dst_color_blend_factor) || + is_dual_src_blend_factor(a->src_alpha_blend_factor) || + is_dual_src_blend_factor(a->dst_alpha_blend_factor))) { + vk_logw(VK_LOG_OBJS(&device->vk.base), + "Enabled dual-src blend factors without writing both targets " + "in the shader. Disabling blending to avoid GPU hangs."); + entry.ColorBufferBlendEnable = false; + } + + /* Our hardware applies the blend factor prior to the blend function + * regardless of what function is used. Technically, this means the + * hardware can do MORE than GL or Vulkan specify. However, it also + * means that, for MIN and MAX, we have to stomp the blend factor to + * ONE to make it a no-op. 
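+       *
+       * Concretely: Vulkan defines MIN/MAX as min(Cs, Cd) / max(Cs, Cd)
+       * with the blend factors ignored, while the hardware computes
+       * min(Sf * Cs, Df * Cd); forcing both factors to ONE below makes
+       * the two agree.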
+ */ + if (a->color_blend_op == VK_BLEND_OP_MIN || + a->color_blend_op == VK_BLEND_OP_MAX) { + entry.SourceBlendFactor = BLENDFACTOR_ONE; + entry.DestinationBlendFactor = BLENDFACTOR_ONE; + } + if (a->alpha_blend_op == VK_BLEND_OP_MIN || + a->alpha_blend_op == VK_BLEND_OP_MAX) { + entry.SourceAlphaBlendFactor = BLENDFACTOR_ONE; + entry.DestinationAlphaBlendFactor = BLENDFACTOR_ONE; + } + GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry); + state_pos += GENX(BLEND_STATE_ENTRY_length); +#if GFX_VER >= 8 + if (i == 0) + bs0 = entry; +#endif + } + +#if GFX_VER >= 8 + struct GENX(3DSTATE_PS_BLEND) blend = { + GENX(3DSTATE_PS_BLEND_header), + }; + blend.AlphaToCoverageEnable = blend_state.AlphaToCoverageEnable; + blend.ColorBufferBlendEnable = bs0.ColorBufferBlendEnable; + blend.SourceAlphaBlendFactor = bs0.SourceAlphaBlendFactor; + blend.DestinationAlphaBlendFactor = bs0.DestinationAlphaBlendFactor; + blend.SourceBlendFactor = bs0.SourceBlendFactor; + blend.DestinationBlendFactor = bs0.DestinationBlendFactor; + blend.AlphaTestEnable = false; + blend.IndependentAlphaBlendEnable = blend_state.IndependentAlphaBlendEnable; + + GENX(3DSTATE_PS_BLEND_pack)(NULL, pipeline->gfx8.ps_blend, &blend); +#endif + + GENX(BLEND_STATE_pack)(NULL, blend_state_start, &blend_state); +} + +static void +emit_3dstate_clip(struct anv_graphics_pipeline *pipeline, + const struct vk_input_assembly_state *ia, + const struct vk_viewport_state *vp, + const struct vk_rasterization_state *rs) +{ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + (void) wm_prog_data; + + struct GENX(3DSTATE_CLIP) clip = { + GENX(3DSTATE_CLIP_header), + }; + + clip.ClipEnable = true; + clip.StatisticsEnable = true; + clip.EarlyCullEnable = true; + clip.APIMode = pipeline->negative_one_to_one ? APIMODE_OGL : APIMODE_D3D; + clip.GuardbandClipTestEnable = true; + +#if GFX_VER >= 8 + clip.VertexSubPixelPrecisionSelect = _8Bit; +#endif + clip.ClipMode = CLIPMODE_NORMAL; + + switch (rs->provoking_vertex) { + case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: + clip.TriangleStripListProvokingVertexSelect = 0; + clip.LineStripListProvokingVertexSelect = 0; + clip.TriangleFanProvokingVertexSelect = 1; + break; + + case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: + clip.TriangleStripListProvokingVertexSelect = 2; + clip.LineStripListProvokingVertexSelect = 1; + clip.TriangleFanProvokingVertexSelect = 2; + break; + + default: + unreachable("Invalid provoking vertex mode"); + } + + clip.MinimumPointWidth = 0.125; + clip.MaximumPointWidth = 255.875; + + /* TODO(mesh): Multiview. */ + if (anv_pipeline_is_primitive(pipeline)) { + const struct brw_vue_prog_data *last = + anv_pipeline_get_last_vue_prog_data(pipeline); + + /* From the Vulkan 1.0.45 spec: + * + * "If the last active vertex processing stage shader entry point's + * interface does not include a variable decorated with + * ViewportIndex, then the first viewport is used." + */ + if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) { + clip.MaximumVPIndex = vp->viewport_count > 0 ? + vp->viewport_count - 1 : 0; + } else { + clip.MaximumVPIndex = 0; + } + + /* From the Vulkan 1.0.45 spec: + * + * "If the last active vertex processing stage shader entry point's + * interface does not include a variable decorated with Layer, then + * the first layer is used." 
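+       *
+       * ForceZeroRTAIndexEnable below implements that: when the last
+       * pre-rasterization stage does not write Layer, the render target
+       * array index is forced to zero.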
+ */ + clip.ForceZeroRTAIndexEnable = + !(last->vue_map.slots_valid & VARYING_BIT_LAYER); + +#if GFX_VER == 7 + clip.UserClipDistanceClipTestEnableBitmask = last->clip_distance_mask; + clip.UserClipDistanceCullTestEnableBitmask = last->cull_distance_mask; +#endif + } else if (anv_pipeline_is_mesh(pipeline)) { + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + if (vp && vp->viewport_count > 0 && + mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) { + clip.MaximumVPIndex = vp->viewport_count - 1; + } + } + +#if GFX_VER == 7 + clip.FrontWinding = genX(vk_to_intel_front_face)[rs->front_face]; + clip.CullMode = genX(vk_to_intel_cullmode)[rs->cull_mode]; + clip.ViewportZClipTestEnable = pipeline->depth_clip_enable; +#else + clip.NonPerspectiveBarycentricEnable = wm_prog_data ? + wm_prog_data->uses_nonperspective_interp_modes : 0; +#endif + + GENX(3DSTATE_CLIP_pack)(NULL, pipeline->gfx7.clip, &clip); + +#if GFX_VERx10 >= 125 + if (anv_pipeline_is_mesh(pipeline)) { + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_CLIP_MESH), clip_mesh) { + clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0; + clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask; + clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask; + } + } +#endif +} + +static void +emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline, + const struct vk_rasterization_state *rs) +{ + const struct brw_vue_prog_data *prog_data = + anv_pipeline_get_last_vue_prog_data(pipeline); + const struct brw_vue_map *vue_map = &prog_data->vue_map; + + nir_xfb_info *xfb_info; + if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) + xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info; + else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) + xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info; + else + xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info; + + if (xfb_info) { + struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128]; + int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0}; + int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0}; + + memset(so_decl, 0, sizeof(so_decl)); + + for (unsigned i = 0; i < xfb_info->output_count; i++) { + const nir_xfb_output_info *output = &xfb_info->outputs[i]; + unsigned buffer = output->buffer; + unsigned stream = xfb_info->buffer_to_stream[buffer]; + + /* Our hardware is unusual in that it requires us to program SO_DECLs + * for fake "hole" components, rather than simply taking the offset + * for each real varying. Each hole can have size 1, 2, 3, or 4; we + * program as many size = 4 holes as we can, then a final hole to + * accommodate the final 1, 2, or 3 remaining. 
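+          *
+          * For example, a 24-byte gap between the previous output and this
+          * one is 6 dwords, which the loop below emits as a 4-dword hole
+          * (ComponentMask 0xf) followed by a 2-dword hole (ComponentMask 0x3).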
+ */ + int hole_dwords = (output->offset - next_offset[buffer]) / 4; + while (hole_dwords > 0) { + so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { + .HoleFlag = 1, + .OutputBufferSlot = buffer, + .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1, + }; + hole_dwords -= 4; + } + + int varying = output->location; + uint8_t component_mask = output->component_mask; + /* VARYING_SLOT_PSIZ contains four scalar fields packed together: + * - VARYING_SLOT_PRIMITIVE_SHADING_RATE in VARYING_SLOT_PSIZ.x + * - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y + * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z + * - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w + */ + if (varying == VARYING_SLOT_PRIMITIVE_SHADING_RATE) { + varying = VARYING_SLOT_PSIZ; + component_mask = 1 << 0; // SO_DECL_COMPMASK_X + } else if (varying == VARYING_SLOT_LAYER) { + varying = VARYING_SLOT_PSIZ; + component_mask = 1 << 1; // SO_DECL_COMPMASK_Y + } else if (varying == VARYING_SLOT_VIEWPORT) { + varying = VARYING_SLOT_PSIZ; + component_mask = 1 << 2; // SO_DECL_COMPMASK_Z + } else if (varying == VARYING_SLOT_PSIZ) { + component_mask = 1 << 3; // SO_DECL_COMPMASK_W + } + + next_offset[buffer] = output->offset + + __builtin_popcount(component_mask) * 4; + + const int slot = vue_map->varying_to_slot[varying]; + if (slot < 0) { + /* This can happen if the shader never writes to the varying. + * Insert a hole instead of actual varying data. + */ + so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { + .HoleFlag = true, + .OutputBufferSlot = buffer, + .ComponentMask = component_mask, + }; + } else { + so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { + .OutputBufferSlot = buffer, + .RegisterIndex = slot, + .ComponentMask = component_mask, + }; + } + } + + int max_decls = 0; + for (unsigned s = 0; s < MAX_XFB_STREAMS; s++) + max_decls = MAX2(max_decls, decls[s]); + + uint8_t sbs[MAX_XFB_STREAMS] = { }; + for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) { + if (xfb_info->buffers_written & (1 << b)) + sbs[xfb_info->buffer_to_stream[b]] |= 1 << b; + } + + /* Wa_16011773973: + * If SOL is enabled and SO_DECL state has to be programmed, + * 1. Send 3D State SOL state with SOL disabled + * 2. Send SO_DECL NP state + * 3. 
Send 3D State SOL with SOL Enabled + */ + if (intel_device_info_is_dg2(pipeline->base.device->info)) + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), so); + + uint32_t *dw = anv_batch_emitn(&pipeline->base.batch, 3 + 2 * max_decls, + GENX(3DSTATE_SO_DECL_LIST), + .StreamtoBufferSelects0 = sbs[0], + .StreamtoBufferSelects1 = sbs[1], + .StreamtoBufferSelects2 = sbs[2], + .StreamtoBufferSelects3 = sbs[3], + .NumEntries0 = decls[0], + .NumEntries1 = decls[1], + .NumEntries2 = decls[2], + .NumEntries3 = decls[3]); + + for (int i = 0; i < max_decls; i++) { + GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2, + &(struct GENX(SO_DECL_ENTRY)) { + .Stream0Decl = so_decl[0][i], + .Stream1Decl = so_decl[1][i], + .Stream2Decl = so_decl[2][i], + .Stream3Decl = so_decl[3][i], + }); + } + } + +#if GFX_VER == 7 +# define streamout_state_dw pipeline->gfx7.streamout_state +#else +# define streamout_state_dw pipeline->gfx8.streamout_state +#endif + + struct GENX(3DSTATE_STREAMOUT) so = { + GENX(3DSTATE_STREAMOUT_header), + }; + + if (xfb_info) { + so.SOFunctionEnable = true; + so.SOStatisticsEnable = true; + + switch (rs->provoking_vertex) { + case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: + so.ReorderMode = LEADING; + break; + + case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: + so.ReorderMode = TRAILING; + break; + + default: + unreachable("Invalid provoking vertex mode"); + } + + so.RenderStreamSelect = rs->rasterization_stream; + +#if GFX_VER >= 8 + so.Buffer0SurfacePitch = xfb_info->buffers[0].stride; + so.Buffer1SurfacePitch = xfb_info->buffers[1].stride; + so.Buffer2SurfacePitch = xfb_info->buffers[2].stride; + so.Buffer3SurfacePitch = xfb_info->buffers[3].stride; +#else + pipeline->gfx7.xfb_bo_pitch[0] = xfb_info->buffers[0].stride; + pipeline->gfx7.xfb_bo_pitch[1] = xfb_info->buffers[1].stride; + pipeline->gfx7.xfb_bo_pitch[2] = xfb_info->buffers[2].stride; + pipeline->gfx7.xfb_bo_pitch[3] = xfb_info->buffers[3].stride; + + /* On Gfx7, the SO buffer enables live in 3DSTATE_STREAMOUT which + * is a bit inconvenient because we don't know what buffers will + * actually be enabled until draw time. We do our best here by + * setting them based on buffers_written and we disable them + * as-needed at draw time by setting EndAddress = BaseAddress. + */ + so.SOBufferEnable0 = xfb_info->buffers_written & (1 << 0); + so.SOBufferEnable1 = xfb_info->buffers_written & (1 << 1); + so.SOBufferEnable2 = xfb_info->buffers_written & (1 << 2); + so.SOBufferEnable3 = xfb_info->buffers_written & (1 << 3); +#endif + + int urb_entry_read_offset = 0; + int urb_entry_read_length = + (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset; + + /* We always read the whole vertex. This could be reduced at some + * point by reading less and offsetting the register index in the + * SO_DECLs. 
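+       *
+       * For example, a VUE map with 9 slots yields urb_entry_read_length =
+       * (9 + 1) / 2 = 5, covering every written slot even if only a few of
+       * them are actually streamed out.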
+ */ + so.Stream0VertexReadOffset = urb_entry_read_offset; + so.Stream0VertexReadLength = urb_entry_read_length - 1; + so.Stream1VertexReadOffset = urb_entry_read_offset; + so.Stream1VertexReadLength = urb_entry_read_length - 1; + so.Stream2VertexReadOffset = urb_entry_read_offset; + so.Stream2VertexReadLength = urb_entry_read_length - 1; + so.Stream3VertexReadOffset = urb_entry_read_offset; + so.Stream3VertexReadLength = urb_entry_read_length - 1; + } + + GENX(3DSTATE_STREAMOUT_pack)(NULL, streamout_state_dw, &so); +} + +static uint32_t +get_sampler_count(const struct anv_shader_bin *bin) +{ + uint32_t count_by_4 = DIV_ROUND_UP(bin->bind_map.sampler_count, 4); + + /* We can potentially have way more than 32 samplers and that's ok. + * However, the 3DSTATE_XS packets only have 3 bits to specify how + * many to pre-fetch and all values above 4 are marked reserved. + */ + return MIN2(count_by_4, 4); +} + +static UNUSED struct anv_address +get_scratch_address(struct anv_pipeline *pipeline, + gl_shader_stage stage, + const struct anv_shader_bin *bin) +{ + return (struct anv_address) { + .bo = anv_scratch_pool_alloc(pipeline->device, + &pipeline->device->scratch_pool, + stage, bin->prog_data->total_scratch), + .offset = 0, + }; +} + +static UNUSED uint32_t +get_scratch_space(const struct anv_shader_bin *bin) +{ + return ffs(bin->prog_data->total_scratch / 2048); +} + +static UNUSED uint32_t +get_scratch_surf(struct anv_pipeline *pipeline, + gl_shader_stage stage, + const struct anv_shader_bin *bin) +{ + if (bin->prog_data->total_scratch == 0) + return 0; + + struct anv_bo *bo = + anv_scratch_pool_alloc(pipeline->device, + &pipeline->device->scratch_pool, + stage, bin->prog_data->total_scratch); + anv_reloc_list_add_bo(pipeline->batch.relocs, + pipeline->batch.alloc, bo); + return anv_scratch_pool_get_surf(pipeline->device, + &pipeline->device->scratch_pool, + bin->prog_data->total_scratch) >> 4; +} + +static void +emit_3dstate_vs(struct anv_graphics_pipeline *pipeline) +{ + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + const struct anv_shader_bin *vs_bin = + pipeline->shaders[MESA_SHADER_VERTEX]; + + assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX)); + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VS), vs) { + vs.Enable = true; + vs.StatisticsEnable = true; + vs.KernelStartPointer = vs_bin->kernel.offset; +#if GFX_VER >= 8 + vs.SIMD8DispatchEnable = + vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8; +#endif + + assert(!vs_prog_data->base.base.use_alt_mode); +#if GFX_VER < 11 + vs.SingleVertexDispatch = false; +#endif + vs.VectorMaskEnable = false; + /* Wa_1606682166: + * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes. + * Disable the Sampler state prefetch functionality in the SARB by + * programming 0xB000[30] to '1'. + */ + vs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(vs_bin); + vs.BindingTableEntryCount = vs_bin->bind_map.surface_count; + vs.FloatingPointMode = IEEE754; + vs.IllegalOpcodeExceptionEnable = false; + vs.SoftwareExceptionEnable = false; + vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1; + + if (GFX_VER == 9 && devinfo->gt == 4 && + anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) { + /* On Sky Lake GT4, we have experienced some hangs related to the VS + * cache and tessellation. 
It is unknown exactly what is happening + * but the Haswell docs for the "VS Reference Count Full Force Miss + * Enable" field of the "Thread Mode" register refer to a HSW bug in + * which the VUE handle reference count would overflow resulting in + * internal reference counting bugs. My (Jason's) best guess is that + * this bug cropped back up on SKL GT4 when we suddenly had more + * threads in play than any previous gfx9 hardware. + * + * What we do know for sure is that setting this bit when + * tessellation shaders are in use fixes a GPU hang in Batman: Arkham + * City when playing with DXVK (https://bugs.freedesktop.org/107280). + * Disabling the vertex cache with tessellation shaders should only + * have a minor performance impact as the tessellation shaders are + * likely generating and processing far more geometry than the vertex + * stage. + */ + vs.VertexCacheDisable = true; + } + + vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length; + vs.VertexURBEntryReadOffset = 0; + vs.DispatchGRFStartRegisterForURBData = + vs_prog_data->base.base.dispatch_grf_start_reg; + +#if GFX_VER >= 8 + vs.UserClipDistanceClipTestEnableBitmask = + vs_prog_data->base.clip_distance_mask; + vs.UserClipDistanceCullTestEnableBitmask = + vs_prog_data->base.cull_distance_mask; +#endif + +#if GFX_VERx10 >= 125 + vs.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_VERTEX, vs_bin); +#else + vs.PerThreadScratchSpace = get_scratch_space(vs_bin); + vs.ScratchSpaceBasePointer = + get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin); +#endif + } +} + +static void +emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline, + const struct vk_tessellation_state *ts) +{ + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs); + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te); + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds); + return; + } + + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct anv_shader_bin *tcs_bin = + pipeline->shaders[MESA_SHADER_TESS_CTRL]; + const struct anv_shader_bin *tes_bin = + pipeline->shaders[MESA_SHADER_TESS_EVAL]; + + const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline); + const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline); + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs) { + hs.Enable = true; + hs.StatisticsEnable = true; + hs.KernelStartPointer = tcs_bin->kernel.offset; + /* Wa_1606682166 */ + hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin); + hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count; + +#if GFX_VER >= 12 + /* Wa_1604578095: + * + * Hang occurs when the number of max threads is less than 2 times + * the number of instance count. The number of max threads must be + * more than 2 times the number of instance count. 
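+       *
+       * The assert below encodes that requirement as a strict inequality,
+       *
+       *    tcs_prog_data->instances < devinfo->max_tcs_threads / 2
+       *
+       * so, to take a made-up figure, a part reporting max_tcs_threads = 224
+       * could run at most 111 HS instances.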
+ */ + assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances); +#endif + + hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1; + hs.IncludeVertexHandles = true; + hs.InstanceCount = tcs_prog_data->instances - 1; + + hs.VertexURBEntryReadLength = 0; + hs.VertexURBEntryReadOffset = 0; + hs.DispatchGRFStartRegisterForURBData = + tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f; +#if GFX_VER >= 12 + hs.DispatchGRFStartRegisterForURBData5 = + tcs_prog_data->base.base.dispatch_grf_start_reg >> 5; +#endif + +#if GFX_VERx10 >= 125 + hs.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin); +#else + hs.PerThreadScratchSpace = get_scratch_space(tcs_bin); + hs.ScratchSpaceBasePointer = + get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin); +#endif + +#if GFX_VER == 12 + /* Patch Count threshold specifies the maximum number of patches that + * will be accumulated before a thread dispatch is forced. + */ + hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold; +#endif + +#if GFX_VER >= 9 + hs.DispatchMode = tcs_prog_data->base.dispatch_mode; + hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id; +#endif + } + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te) { + te.Partitioning = tes_prog_data->partitioning; + + if (ts->domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) { + te.OutputTopology = tes_prog_data->output_topology; + } else { + /* When the origin is upper-left, we have to flip the winding order */ + if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) { + te.OutputTopology = OUTPUT_TRI_CW; + } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) { + te.OutputTopology = OUTPUT_TRI_CCW; + } else { + te.OutputTopology = tes_prog_data->output_topology; + } + } + + te.TEDomain = tes_prog_data->domain; + te.TEEnable = true; + te.MaximumTessellationFactorOdd = 63.0; + te.MaximumTessellationFactorNotOdd = 64.0; +#if GFX_VERx10 >= 125 + te.TessellationDistributionMode = TEDMODE_RR_FREE; + te.TessellationDistributionLevel = TEDLEVEL_PATCH; + /* 64_TRIANGLES */ + te.SmallPatchThreshold = 3; + /* 1K_TRIANGLES */ + te.TargetBlockSize = 8; + /* 1K_TRIANGLES */ + te.LocalBOPAccumulatorThreshold = 1; +#endif + } + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds) { + ds.Enable = true; + ds.StatisticsEnable = true; + ds.KernelStartPointer = tes_bin->kernel.offset; + /* Wa_1606682166 */ + ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin); + ds.BindingTableEntryCount = tes_bin->bind_map.surface_count; + ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1; + + ds.ComputeWCoordinateEnable = + tes_prog_data->domain == BRW_TESS_DOMAIN_TRI; + + ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length; + ds.PatchURBEntryReadOffset = 0; + ds.DispatchGRFStartRegisterForURBData = + tes_prog_data->base.base.dispatch_grf_start_reg; + +#if GFX_VER >= 8 +#if GFX_VER < 11 + ds.DispatchMode = + tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ? 
+ DISPATCH_MODE_SIMD8_SINGLE_PATCH : + DISPATCH_MODE_SIMD4X2; +#else + assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8); + ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH; +#endif + + ds.UserClipDistanceClipTestEnableBitmask = + tes_prog_data->base.clip_distance_mask; + ds.UserClipDistanceCullTestEnableBitmask = + tes_prog_data->base.cull_distance_mask; +#endif + +#if GFX_VER >= 12 + ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id; +#endif +#if GFX_VERx10 >= 125 + ds.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin); +#else + ds.PerThreadScratchSpace = get_scratch_space(tes_bin); + ds.ScratchSpaceBasePointer = + get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin); +#endif + } +} + +static void +emit_3dstate_gs(struct anv_graphics_pipeline *pipeline) +{ + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct anv_shader_bin *gs_bin = + pipeline->shaders[MESA_SHADER_GEOMETRY]; + + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs); + return; + } + + const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline); + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs) { + gs.Enable = true; + gs.StatisticsEnable = true; + gs.KernelStartPointer = gs_bin->kernel.offset; + gs.DispatchMode = gs_prog_data->base.dispatch_mode; + + gs.SingleProgramFlow = false; + gs.VectorMaskEnable = false; + /* Wa_1606682166 */ + gs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(gs_bin); + gs.BindingTableEntryCount = gs_bin->bind_map.surface_count; + gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles; + gs.IncludePrimitiveID = gs_prog_data->include_primitive_id; + + if (GFX_VER == 8) { + /* Broadwell is weird. It needs us to divide by 2. */ + gs.MaximumNumberofThreads = devinfo->max_gs_threads / 2 - 1; + } else { + gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1; + } + + gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1; + gs.OutputTopology = gs_prog_data->output_topology; + gs.ControlDataFormat = gs_prog_data->control_data_format; + gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords; + gs.InstanceControl = MAX2(gs_prog_data->invocations, 1) - 1; + gs.ReorderMode = TRAILING; + +#if GFX_VER >= 8 + gs.ExpectedVertexCount = gs_prog_data->vertices_in; + gs.StaticOutput = gs_prog_data->static_vertex_count >= 0; + gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ? 
+ gs_prog_data->static_vertex_count : 0; +#endif + + gs.VertexURBEntryReadOffset = 0; + gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length; + gs.DispatchGRFStartRegisterForURBData = + gs_prog_data->base.base.dispatch_grf_start_reg; + +#if GFX_VER >= 8 + gs.UserClipDistanceClipTestEnableBitmask = + gs_prog_data->base.clip_distance_mask; + gs.UserClipDistanceCullTestEnableBitmask = + gs_prog_data->base.cull_distance_mask; +#endif + +#if GFX_VERx10 >= 125 + gs.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin); +#else + gs.PerThreadScratchSpace = get_scratch_space(gs_bin); + gs.ScratchSpaceBasePointer = + get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin); +#endif + } +} + +static void +emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, + const struct vk_input_assembly_state *ia, + const struct vk_rasterization_state *rs, + const struct vk_multisample_state *ms, + const struct vk_color_blend_state *cb, + const struct vk_render_pass_state *rp) +{ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + + struct GENX(3DSTATE_WM) wm = { + GENX(3DSTATE_WM_header), + }; + wm.StatisticsEnable = true; + wm.LineEndCapAntialiasingRegionWidth = _05pixels; + wm.LineAntialiasingRegionWidth = _10pixels; + wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT; + + if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + if (wm_prog_data->early_fragment_tests) { + wm.EarlyDepthStencilControl = EDSC_PREPS; + } else if (wm_prog_data->has_side_effects) { + wm.EarlyDepthStencilControl = EDSC_PSEXEC; + } else { + wm.EarlyDepthStencilControl = EDSC_NORMAL; + } + +#if GFX_VER >= 8 + /* Gen8 hardware tries to compute ThreadDispatchEnable for us but + * doesn't take into account KillPixels when no depth or stencil + * writes are enabled. In order for occlusion queries to work + * correctly with no attachments, we need to force-enable PS thread + * dispatch. + * + * The BDW docs are pretty clear that that this bit isn't validated + * and probably shouldn't be used in production: + * + * "This must always be set to Normal. This field should not be + * tested for functional validation." + * + * Unfortunately, however, the other mechanism we have for doing this + * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW. + * Given two bad options, we choose the one which works. + */ + pipeline->force_fragment_thread_dispatch = + wm_prog_data->has_side_effects || + wm_prog_data->uses_kill; +#endif + + wm.BarycentricInterpolationMode = + wm_prog_data->barycentric_interp_modes; + +#if GFX_VER < 8 + wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode; + wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth; + wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w; + wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask; + + /* If the subpass has a depth or stencil self-dependency, then we + * need to force the hardware to do the depth/stencil write *after* + * fragment shader execution. Otherwise, the writes may hit memory + * before we get around to fetching from the input attachment and we + * may get the depth or stencil value from the current draw rather + * than the previous one. 
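+       *
+       * "Self-dependency" here means the feedback-loop case where the same
+       * image is bound both as the depth/stencil attachment and as an input
+       * attachment of the subpass; setting PixelShaderKillsPixel below is
+       * the mechanism used to force that late write.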
+ */ + wm.PixelShaderKillsPixel = rp->depth_self_dependency || + rp->stencil_self_dependency || + wm_prog_data->uses_kill; + + pipeline->force_fragment_thread_dispatch = + wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF || + wm_prog_data->has_side_effects || + wm.PixelShaderKillsPixel; + + if (ms != NULL && ms->rasterization_samples > 1) { + if (wm_prog_data->persample_dispatch) { + wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; + } else { + wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL; + } + } else { + wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; + } +#endif + + wm.LineStippleEnable = rs->line.stipple.enable; + } + + const struct intel_device_info *devinfo = pipeline->base.device->info; + uint32_t *dws = devinfo->ver >= 8 ? pipeline->gfx8.wm : pipeline->gfx7.wm; + GENX(3DSTATE_WM_pack)(NULL, dws, &wm); +} + +static void +emit_3dstate_ps(struct anv_graphics_pipeline *pipeline, + const struct vk_multisample_state *ms, + const struct vk_color_blend_state *cb) +{ + UNUSED const struct intel_device_info *devinfo = + pipeline->base.device->info; + const struct anv_shader_bin *fs_bin = + pipeline->shaders[MESA_SHADER_FRAGMENT]; + + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) { +#if GFX_VER == 7 + /* Even if no fragments are ever dispatched, gfx7 hardware hangs if + * we don't at least set the maximum number of threads. + */ + ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1; +#endif + } + return; + } + + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + +#if GFX_VER < 8 + /* The hardware wedges if you have this bit set but don't turn on any dual + * source blend factors. + */ + bool dual_src_blend = false; + if (wm_prog_data->dual_src_blend && cb) { + for (uint32_t i = 0; i < cb->attachment_count; i++) { + const struct vk_color_blend_attachment_state *a = + &cb->attachments[i]; + + if (a->blend_enable && + (is_dual_src_blend_factor(a->src_color_blend_factor) || + is_dual_src_blend_factor(a->dst_color_blend_factor) || + is_dual_src_blend_factor(a->src_alpha_blend_factor) || + is_dual_src_blend_factor(a->dst_alpha_blend_factor))) { + dual_src_blend = true; + break; + } + } + } +#endif + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) { + ps._8PixelDispatchEnable = wm_prog_data->dispatch_8; + ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; + ps._32PixelDispatchEnable = wm_prog_data->dispatch_32; + + /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable: + * + * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32 + * Dispatch must not be enabled for PER_PIXEL dispatch mode." + * + * Since 16x MSAA is first introduced on SKL, we don't need to apply + * the workaround on any older hardware. + */ + if (GFX_VER >= 9 && !wm_prog_data->persample_dispatch && + ms != NULL && ms->rasterization_samples == 16) { + assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable); + ps._32PixelDispatchEnable = false; + } + + ps.KernelStartPointer0 = fs_bin->kernel.offset + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0); + ps.KernelStartPointer1 = fs_bin->kernel.offset + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1); + ps.KernelStartPointer2 = fs_bin->kernel.offset + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2); + + ps.SingleProgramFlow = false; + ps.VectorMaskEnable = GFX_VER >= 8 && + wm_prog_data->uses_vmask; + /* Wa_1606682166 */ + ps.SamplerCount = GFX_VER == 11 ? 
0 : get_sampler_count(fs_bin); + ps.BindingTableEntryCount = fs_bin->bind_map.surface_count; + ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 || + wm_prog_data->base.ubo_ranges[0].length; + ps.PositionXYOffsetSelect = wm_prog_data->uses_pos_offset ? + POSOFFSET_SAMPLE: POSOFFSET_NONE; +#if GFX_VER < 8 + ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0; + ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; + ps.DualSourceBlendEnable = dual_src_blend; +#endif + +#if GFX_VERx10 == 75 + /* Haswell requires the sample mask to be set in this packet as well + * as in 3DSTATE_SAMPLE_MASK; the values should match. + */ + ps.SampleMask = 0xff; +#endif + +#if GFX_VER >= 8 + ps.MaximumNumberofThreadsPerPSD = + devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1); +#else + ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1; +#endif + + ps.DispatchGRFStartRegisterForConstantSetupData0 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0); + ps.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1); + ps.DispatchGRFStartRegisterForConstantSetupData2 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2); + +#if GFX_VERx10 >= 125 + ps.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin); +#else + ps.PerThreadScratchSpace = get_scratch_space(fs_bin); + ps.ScratchSpaceBasePointer = + get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin); +#endif + } +} + +#if GFX_VER >= 8 +static void +emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline, + const struct vk_rasterization_state *rs, + const struct vk_render_pass_state *rp) +{ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps); + return; + } + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps) { + ps.PixelShaderValid = true; + ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0; + ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; + ps.PixelShaderIsPerSample = wm_prog_data->persample_dispatch; + ps.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode; + ps.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth; + ps.PixelShaderUsesSourceW = wm_prog_data->uses_src_w; + + /* If the subpass has a depth or stencil self-dependency, then we need + * to force the hardware to do the depth/stencil write *after* fragment + * shader execution. Otherwise, the writes may hit memory before we get + * around to fetching from the input attachment and we may get the depth + * or stencil value from the current draw rather than the previous one. 
+ */ + ps.PixelShaderKillsPixel = rp->depth_self_dependency || + rp->stencil_self_dependency || + wm_prog_data->uses_kill; + +#if GFX_VER >= 9 + ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil; + ps.PixelShaderPullsBary = wm_prog_data->pulls_bary; + + ps.InputCoverageMaskState = ICMS_NONE; + assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */ + if (!wm_prog_data->uses_sample_mask) + ps.InputCoverageMaskState = ICMS_NONE; + else if (wm_prog_data->per_coarse_pixel_dispatch) + ps.InputCoverageMaskState = ICMS_NORMAL; + else if (wm_prog_data->post_depth_coverage) + ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE; + else + ps.InputCoverageMaskState = ICMS_NORMAL; +#else + ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask; +#endif + +#if GFX_VER >= 11 + ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients = + wm_prog_data->uses_depth_w_coefficients; + ps.PixelShaderIsPerCoarsePixel = wm_prog_data->per_coarse_pixel_dispatch; +#endif +#if GFX_VERx10 >= 125 + /* TODO: We should only require this when the last geometry shader uses + * a fragment shading rate that is not constant. + */ + ps.EnablePSDependencyOnCPsizeChange = wm_prog_data->per_coarse_pixel_dispatch; +#endif + } +} +#endif + +static void +emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline) +{ + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_STATISTICS), vfs) { + vfs.StatisticsEnable = true; + } +} + +static void +compute_kill_pixel(struct anv_graphics_pipeline *pipeline, + const struct vk_multisample_state *ms, + const struct vk_render_pass_state *rp) +{ + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + pipeline->kill_pixel = false; + return; + } + + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + + /* This computes the KillPixel portion of the computation for whether or + * not we want to enable the PMA fix on gfx8 or gfx9. It's given by this + * chunk of the giant formula: + * + * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) + * + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is + * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept + * of an alpha test. 
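+    *
+    * With those two terms dropped, what remains of the formula is exactly
+    * what gets computed below:
+    *
+    *    kill_pixel = self_dependency || uses_kill || uses_omask ||
+    *                 alpha_to_coverage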
+ */ + pipeline->kill_pixel = + rp->depth_self_dependency || + rp->stencil_self_dependency || + wm_prog_data->uses_kill || + wm_prog_data->uses_omask || + (ms && ms->alpha_to_coverage_enable); +} + +#if GFX_VER == 12 +static void +emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline, + const struct vk_render_pass_state *rp) +{ + const int replication_count = + anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots; + + assert(replication_count >= 1); + if (replication_count == 1) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); + return; + } + + uint32_t view_mask = rp->view_mask; + assert(replication_count == util_bitcount(view_mask)); + assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION); + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) { + pr.ReplicaMask = (1 << replication_count) - 1; + pr.ReplicationCount = replication_count - 1; + + int i = 0; + u_foreach_bit(view_index, rp->view_mask) { + pr.RTAIOffset[i] = view_index; + i++; + } + } +} +#endif + +#if GFX_VERx10 >= 125 +static void +emit_task_state(struct anv_graphics_pipeline *pipeline) +{ + assert(anv_pipeline_is_mesh(pipeline)); + + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_CONTROL), zero); + return; + } + + const struct anv_shader_bin *task_bin = pipeline->shaders[MESA_SHADER_TASK]; + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_CONTROL), tc) { + tc.TaskShaderEnable = true; + tc.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_TASK, task_bin); + } + + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline); + const struct brw_cs_dispatch_info task_dispatch = + brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL); + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_SHADER), task) { + task.KernelStartPointer = task_bin->kernel.offset; + task.SIMDSize = task_dispatch.simd_size / 16; + task.MessageSIMD = task.SIMDSize; + task.NumberofThreadsinGPGPUThreadGroup = task_dispatch.threads; + task.ExecutionMask = task_dispatch.right_mask; + task.LocalXMaximum = task_dispatch.group_size - 1; + task.EmitLocalIDX = true; + + task.NumberofBarriers = task_prog_data->base.uses_barrier; + task.SharedLocalMemorySize = + encode_slm_size(GFX_VER, task_prog_data->base.base.total_shared); + + /* + * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for an address + * of a buffer with push constants and descriptor set table and + * InlineData[2:7] will be used for first few push constants. + */ + task.EmitInlineParameter = true; + + task.XP0Required = task_prog_data->uses_drawid; + } + + /* Recommended values from "Task and Mesh Distribution Programming". */ + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_REDISTRIB), redistrib) { + redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1; + redistrib.SmallTaskThreshold = 1; /* 2^N */ + redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */ + redistrib.TaskRedistributionLevel = TASKREDISTRIB_BOM; + + /* TODO: We have an unknown issue with Task Payload when task redistribution + * is enabled. Disable it for now. 
+ * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/7141 + */ + redistrib.TaskRedistributionMode = TASKREDISTRIB_OFF; + } +} + +static void +emit_mesh_state(struct anv_graphics_pipeline *pipeline) +{ + assert(anv_pipeline_is_mesh(pipeline)); + + const struct anv_shader_bin *mesh_bin = pipeline->shaders[MESA_SHADER_MESH]; + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MESH_CONTROL), mc) { + mc.MeshShaderEnable = true; + mc.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_MESH, mesh_bin); + + /* TODO(mesh): MaximumNumberofThreadGroups. */ + } + + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + const struct brw_cs_dispatch_info mesh_dispatch = + brw_cs_get_dispatch_info(devinfo, &mesh_prog_data->base, NULL); + + const unsigned output_topology = + mesh_prog_data->primitive_type == SHADER_PRIM_POINTS ? OUTPUT_POINT : + mesh_prog_data->primitive_type == SHADER_PRIM_LINES ? OUTPUT_LINE : + OUTPUT_TRI; + + uint32_t index_format; + switch (mesh_prog_data->index_format) { + case BRW_INDEX_FORMAT_U32: + index_format = INDEX_U32; + break; + default: + unreachable("invalid index format"); + } + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MESH_SHADER), mesh) { + mesh.KernelStartPointer = mesh_bin->kernel.offset; + mesh.SIMDSize = mesh_dispatch.simd_size / 16; + mesh.MessageSIMD = mesh.SIMDSize; + mesh.NumberofThreadsinGPGPUThreadGroup = mesh_dispatch.threads; + mesh.ExecutionMask = mesh_dispatch.right_mask; + mesh.LocalXMaximum = mesh_dispatch.group_size - 1; + mesh.EmitLocalIDX = true; + + mesh.MaximumPrimitiveCount = mesh_prog_data->map.max_primitives - 1; + mesh.OutputTopology = output_topology; + mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_pitch_dw / 8; + mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_pitch_dw > 0; + mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_pitch_dw / 8; + mesh.IndexFormat = index_format; + + mesh.NumberofBarriers = mesh_prog_data->base.uses_barrier; + mesh.SharedLocalMemorySize = + encode_slm_size(GFX_VER, mesh_prog_data->base.base.total_shared); + + /* + * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for an address + * of a buffer with push constants and descriptor set table and + * InlineData[2:7] will be used for first few push constants. + */ + mesh.EmitInlineParameter = true; + + mesh.XP0Required = mesh_prog_data->uses_drawid; + } + + /* Recommended values from "Task and Mesh Distribution Programming". */ + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MESH_DISTRIB), distrib) { + distrib.DistributionMode = MESH_RR_FREE; + distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 8 : 9; /* 2^N thread groups */ + distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 
5 : 3; /* 2^N thread groups */ + } +} +#endif + +void +genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline, + const struct vk_graphics_pipeline_state *state) +{ + enum intel_urb_deref_block_size urb_deref_block_size; + emit_urb_setup(pipeline, &urb_deref_block_size); + + assert(state->rs != NULL); + emit_rs_state(pipeline, state->ia, state->rs, state->ms, state->rp, + urb_deref_block_size); + emit_ms_state(pipeline, state->ms); + emit_cb_state(pipeline, state->cb, state->ms); + compute_kill_pixel(pipeline, state->ms, state->rp); + + emit_3dstate_clip(pipeline, state->ia, state->vp, state->rs); + +#if GFX_VER == 12 + emit_3dstate_primitive_replication(pipeline, state->rp); +#endif + +#if 0 + /* From gfx7_vs_state.c */ + + /** + * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages > + * Geometry > Geometry Shader > State: + * + * "Note: Because of corruption in IVB:GT2, software needs to flush the + * whole fixed function pipeline when the GS enable changes value in + * the 3DSTATE_GS." + * + * The hardware architects have clarified that in this context "flush the + * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS + * Stall" bit set. + */ + if (device->info->platform == INTEL_PLATFORM_IVB) + gfx7_emit_vs_workaround_flush(brw); +#endif + + if (anv_pipeline_is_primitive(pipeline)) { + emit_vertex_input(pipeline, state->vi); + + emit_3dstate_vs(pipeline); + emit_3dstate_hs_te_ds(pipeline, state->ts); + emit_3dstate_gs(pipeline); + + emit_3dstate_vf_statistics(pipeline); + + emit_3dstate_streamout(pipeline, state->rs); + +#if GFX_VERx10 >= 125 + const struct anv_device *device = pipeline->base.device; + /* Disable Mesh. */ + if (device->physical->vk.supported_extensions.NV_mesh_shader) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MESH_CONTROL), zero); + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_CONTROL), zero); + } +#endif + } else { + assert(anv_pipeline_is_mesh(pipeline)); + + /* BSpec 46303 forbids both 3DSTATE_MESH_CONTROL.MeshShaderEnable + * and 3DSTATE_STREAMOUT.SOFunctionEnable to be 1. 
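+       *
+       * Emitting the packet below with no fields set leaves SOFunctionEnable
+       * at its zeroed default, which keeps mesh pipelines within that
+       * restriction.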
+ */ + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), so) {} + +#if GFX_VERx10 >= 125 + emit_task_state(pipeline); + emit_mesh_state(pipeline); +#endif + } + + emit_3dstate_sbe(pipeline); + emit_3dstate_wm(pipeline, state->ia, state->rs, + state->ms, state->cb, state->rp); + emit_3dstate_ps(pipeline, state->ms, state->cb); +#if GFX_VER >= 8 + emit_3dstate_ps_extra(pipeline, state->rs, state->rp); +#endif +} + +#if GFX_VERx10 >= 125 + +void +genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) +{ + struct anv_device *device = pipeline->base.device; + const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); + anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0); + + const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs; + const struct intel_device_info *devinfo = device->info; + + anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) { + cfe.MaximumNumberofThreads = + devinfo->max_cs_threads * devinfo->subslice_total; + cfe.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin); + } +} + +#else /* #if GFX_VERx10 >= 125 */ + +void +genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) +{ + struct anv_device *device = pipeline->base.device; + const struct intel_device_info *devinfo = device->info; + const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); + + anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0); + + const struct brw_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL); + const uint32_t vfe_curbe_allocation = + ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads + + cs_prog_data->push.cross_thread.regs, 2); + + const struct anv_shader_bin *cs_bin = pipeline->cs; + + anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) { +#if GFX_VER > 7 + vfe.StackSize = 0; +#else + vfe.GPGPUMode = true; +#endif + vfe.MaximumNumberofThreads = + devinfo->max_cs_threads * devinfo->subslice_total - 1; + vfe.NumberofURBEntries = GFX_VER <= 7 ? 0 : 2; +#if GFX_VER < 11 + vfe.ResetGatewayTimer = true; +#endif +#if GFX_VER <= 8 + vfe.BypassGatewayControl = true; +#endif + vfe.URBEntryAllocationSize = GFX_VER <= 7 ? 0 : 2; + vfe.CURBEAllocationSize = vfe_curbe_allocation; + + if (cs_bin->prog_data->total_scratch) { + if (GFX_VER >= 8) { + /* Broadwell's Per Thread Scratch Space is in the range [0, 11] + * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M. + */ + vfe.PerThreadScratchSpace = + ffs(cs_bin->prog_data->total_scratch) - 11; + } else if (GFX_VERx10 == 75) { + /* Haswell's Per Thread Scratch Space is in the range [0, 10] + * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M. + */ + vfe.PerThreadScratchSpace = + ffs(cs_bin->prog_data->total_scratch) - 12; + } else { + /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB] + * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB. + */ + vfe.PerThreadScratchSpace = + cs_bin->prog_data->total_scratch / 1024 - 1; + } + vfe.ScratchSpaceBasePointer = + get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin); + } + } + + struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { + .KernelStartPointer = + cs_bin->kernel.offset + + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size), + + /* Wa_1606682166 */ + .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin), + /* We add 1 because the CS indirect parameters buffer isn't accounted + * for in bind_map.surface_count. 
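+       *
+       * The MIN2(..., 30) below keeps the resulting 1 + count within the
+       * range this field can hold; as with the sampler count, larger binding
+       * tables still work, the excess entries just are not prefetched.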
+ */ + .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30), + .BarrierEnable = cs_prog_data->uses_barrier, + .SharedLocalMemorySize = + encode_slm_size(GFX_VER, cs_prog_data->base.total_shared), + +#if GFX_VERx10 != 75 + .ConstantURBEntryReadOffset = 0, +#endif + .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs, +#if GFX_VERx10 >= 75 + .CrossThreadConstantDataReadLength = + cs_prog_data->push.cross_thread.regs, +#endif +#if GFX_VER >= 12 + /* TODO: Check if we are missing workarounds and enable mid-thread + * preemption. + * + * We still have issues with mid-thread preemption (it was already + * disabled by the kernel on gfx11, due to missing workarounds). It's + * possible that we are just missing some workarounds, and could enable + * it later, but for now let's disable it to fix a GPU in compute in Car + * Chase (and possibly more). + */ + .ThreadPreemptionDisable = true, +#endif + + .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, + }; + GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, + pipeline->interface_descriptor_data, + &desc); +} + +#endif /* #if GFX_VERx10 >= 125 */ + +#if GFX_VERx10 >= 125 + +void +genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline) +{ + for (uint32_t i = 0; i < pipeline->group_count; i++) { + struct anv_rt_shader_group *group = &pipeline->groups[i]; + + switch (group->type) { + case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: { + struct GFX_RT_GENERAL_SBT_HANDLE sh = {}; + sh.General = anv_shader_bin_get_bsr(group->general, 32); + GFX_RT_GENERAL_SBT_HANDLE_pack(NULL, group->handle, &sh); + break; + } + + case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: { + struct GFX_RT_TRIANGLES_SBT_HANDLE sh = {}; + if (group->closest_hit) + sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32); + if (group->any_hit) + sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24); + GFX_RT_TRIANGLES_SBT_HANDLE_pack(NULL, group->handle, &sh); + break; + } + + case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: { + struct GFX_RT_PROCEDURAL_SBT_HANDLE sh = {}; + if (group->closest_hit) + sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32); + sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24); + GFX_RT_PROCEDURAL_SBT_HANDLE_pack(NULL, group->handle, &sh); + break; + } + + default: + unreachable("Invalid shader group type"); + } + } +} + +#else + +void +genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline) +{ + unreachable("Ray tracing not supported"); +} + +#endif /* GFX_VERx10 >= 125 */ diff --git a/src/intel/vulkan_hasvk/genX_query.c b/src/intel/vulkan_hasvk/genX_query.c new file mode 100644 index 00000000000..8c20e2cdfe1 --- /dev/null +++ b/src/intel/vulkan_hasvk/genX_query.c @@ -0,0 +1,1530 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "anv_private.h" + +#include "util/os_time.h" + +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" + +/* We reserve : + * - GPR 14 for perf queries + * - GPR 15 for conditional rendering + */ +#define MI_BUILDER_NUM_ALLOC_GPRS 14 +#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8 +#define __gen_get_batch_dwords anv_batch_emit_dwords +#define __gen_address_offset anv_address_add +#define __gen_get_batch_address(b, a) anv_batch_address(b, a) +#include "common/mi_builder.h" +#include "perf/intel_perf.h" +#include "perf/intel_perf_mdapi.h" +#include "perf/intel_perf_regs.h" + +#include "vk_util.h" + +static struct anv_address +anv_query_address(struct anv_query_pool *pool, uint32_t query) +{ + return (struct anv_address) { + .bo = pool->bo, + .offset = query * pool->stride, + }; +} + +VkResult genX(CreateQueryPool)( + VkDevice _device, + const VkQueryPoolCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkQueryPool* pQueryPool) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + const struct anv_physical_device *pdevice = device->physical; +#if GFX_VER >= 8 + const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL; + struct intel_perf_counter_pass *counter_pass; + struct intel_perf_query_info **pass_query; + uint32_t n_passes = 0; +#endif + uint32_t data_offset = 0; + VK_MULTIALLOC(ma); + VkResult result; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); + + /* Query pool slots are made up of some number of 64-bit values packed + * tightly together. For most query types have the first 64-bit value is + * the "available" bit which is 0 when the query is unavailable and 1 when + * it is available. The 64-bit values that follow are determined by the + * type of query. + * + * For performance queries, we have a requirement to align OA reports at + * 64bytes so we put those first and have the "available" bit behind + * together with some other counters. + */ + uint32_t uint64s_per_slot = 0; + + VK_MULTIALLOC_DECL(&ma, struct anv_query_pool, pool, 1); + + VkQueryPipelineStatisticFlags pipeline_statistics = 0; + switch (pCreateInfo->queryType) { + case VK_QUERY_TYPE_OCCLUSION: + /* Occlusion queries have two values: begin and end. */ + uint64s_per_slot = 1 + 2; + break; + case VK_QUERY_TYPE_TIMESTAMP: + /* Timestamps just have the one timestamp value */ + uint64s_per_slot = 1 + 1; + break; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + pipeline_statistics = pCreateInfo->pipelineStatistics; + /* We're going to trust this field implicitly so we need to ensure that + * no unhandled extension bits leak in. + */ + pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK; + + /* Statistics queries have a min and max for every statistic */ + uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics); + break; + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + /* Transform feedback queries are 4 values, begin/end for + * written/available. 
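+       *
+       * Concretely, the slot layout the rest of this file assumes is:
+       *
+       *    slot[0]      availability
+       *    slot[1..2]   SO_NUM_PRIMS_WRITTEN    begin / end
+       *    slot[3..4]   SO_PRIM_STORAGE_NEEDED  begin / end
+       *
+       * hence the 1 + 4 below.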
+ */ + uint64s_per_slot = 1 + 4; + break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + const struct intel_perf_query_field_layout *layout = + &pdevice->perf->query_layout; + + uint64s_per_slot = 2; /* availability + marker */ + /* Align to the requirement of the layout */ + uint64s_per_slot = align_u32(uint64s_per_slot, + DIV_ROUND_UP(layout->alignment, sizeof(uint64_t))); + data_offset = uint64s_per_slot * sizeof(uint64_t); + /* Add the query data for begin & end commands */ + uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t)); + break; + } +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + const struct intel_perf_query_field_layout *layout = + &pdevice->perf->query_layout; + + perf_query_info = vk_find_struct_const(pCreateInfo->pNext, + QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); + n_passes = intel_perf_get_n_passes(pdevice->perf, + perf_query_info->pCounterIndices, + perf_query_info->counterIndexCount, + NULL); + vk_multialloc_add(&ma, &counter_pass, struct intel_perf_counter_pass, + perf_query_info->counterIndexCount); + vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *, + n_passes); + uint64s_per_slot = 4 /* availability + small batch */; + /* Align to the requirement of the layout */ + uint64s_per_slot = align_u32(uint64s_per_slot, + DIV_ROUND_UP(layout->alignment, sizeof(uint64_t))); + data_offset = uint64s_per_slot * sizeof(uint64_t); + /* Add the query data for begin & end commands */ + uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t)); + /* Multiply by the number of passes */ + uint64s_per_slot *= n_passes; + break; + } +#endif + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + /* Query has two values: begin and end. */ + uint64s_per_slot = 1 + 2; + break; + default: + assert(!"Invalid query type"); + } + + if (!vk_object_multialloc(&device->vk, &ma, pAllocator, + VK_OBJECT_TYPE_QUERY_POOL)) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + pool->type = pCreateInfo->queryType; + pool->pipeline_statistics = pipeline_statistics; + pool->stride = uint64s_per_slot * sizeof(uint64_t); + pool->slots = pCreateInfo->queryCount; + + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) { + pool->data_offset = data_offset; + pool->snapshot_size = (pool->stride - data_offset) / 2; + } +#if GFX_VER >= 8 + else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + pool->pass_size = pool->stride / n_passes; + pool->data_offset = data_offset; + pool->snapshot_size = (pool->pass_size - data_offset) / 2; + pool->n_counters = perf_query_info->counterIndexCount; + pool->counter_pass = counter_pass; + intel_perf_get_counters_passes(pdevice->perf, + perf_query_info->pCounterIndices, + perf_query_info->counterIndexCount, + pool->counter_pass); + pool->n_passes = n_passes; + pool->pass_query = pass_query; + intel_perf_get_n_passes(pdevice->perf, + perf_query_info->pCounterIndices, + perf_query_info->counterIndexCount, + pool->pass_query); + } +#endif + + uint64_t size = pool->slots * (uint64_t)pool->stride; + result = anv_device_alloc_bo(device, "query-pool", size, + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED, + 0 /* explicit_address */, + &pool->bo); + if (result != VK_SUCCESS) + goto fail; + +#if GFX_VER >= 8 + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + struct mi_builder b; + struct anv_batch batch = { + .start = pool->bo->map + khr_perf_query_preamble_offset(pool, p), + .end = pool->bo->map + khr_perf_query_preamble_offset(pool, p) + 
pool->data_offset, + }; + batch.next = batch.start; + + mi_builder_init(&b, device->info, &batch); + mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG), + mi_imm(p * (uint64_t)pool->pass_size)); + anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); + } + } +#endif + + *pQueryPool = anv_query_pool_to_handle(pool); + + return VK_SUCCESS; + + fail: + vk_free2(&device->vk.alloc, pAllocator, pool); + + return result; +} + +void genX(DestroyQueryPool)( + VkDevice _device, + VkQueryPool _pool, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_query_pool, pool, _pool); + + if (!pool) + return; + + anv_device_release_bo(device, pool->bo); + vk_object_free(&device->vk, pAllocator, pool); +} + +#if GFX_VER >= 8 +/** + * VK_KHR_performance_query layout : + * + * -------------------------------------------- + * | availability (8b) | | | + * |-------------------------------| | | + * | Small batch loading | | | + * | ANV_PERF_QUERY_OFFSET_REG | | | + * | (24b) | | Pass 0 | + * |-------------------------------| | | + * | some padding (see | | | + * | query_field_layout:alignment) | | | + * |-------------------------------| | | + * | query data | | | + * | (2 * query_field_layout:size) | | | + * |-------------------------------|-- | Query 0 + * | availability (8b) | | | + * |-------------------------------| | | + * | Small batch loading | | | + * | ANV_PERF_QUERY_OFFSET_REG | | | + * | (24b) | | Pass 1 | + * |-------------------------------| | | + * | some padding (see | | | + * | query_field_layout:alignment) | | | + * |-------------------------------| | | + * | query data | | | + * | (2 * query_field_layout:size) | | | + * |-------------------------------|----------- + * | availability (8b) | | | + * |-------------------------------| | | + * | Small batch loading | | | + * | ANV_PERF_QUERY_OFFSET_REG | | | + * | (24b) | | Pass 0 | + * |-------------------------------| | | + * | some padding (see | | | + * | query_field_layout:alignment) | | | + * |-------------------------------| | | + * | query data | | | + * | (2 * query_field_layout:size) | | | + * |-------------------------------|-- | Query 1 + * | ... | | | + * -------------------------------------------- + */ + +static uint64_t +khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass) +{ + return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size; +} + +static uint64_t +khr_perf_query_data_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end) +{ + return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size + + pool->data_offset + (end ? 
pool->snapshot_size : 0); +} + +static struct anv_address +khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass) +{ + return anv_address_add( + (struct anv_address) { .bo = pool->bo, }, + khr_perf_query_availability_offset(pool, query, pass)); +} + +static struct anv_address +khr_perf_query_data_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end) +{ + return anv_address_add( + (struct anv_address) { .bo = pool->bo, }, + khr_perf_query_data_offset(pool, query, pass, end)); +} + +static bool +khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer) +{ + if (anv_batch_has_error(&cmd_buffer->batch)) + return false; + + if (cmd_buffer->self_mod_locations) + return true; + + struct anv_device *device = cmd_buffer->device; + const struct anv_physical_device *pdevice = device->physical; + + cmd_buffer->self_mod_locations = + vk_alloc(&cmd_buffer->vk.pool->alloc, + pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (!cmd_buffer->self_mod_locations) { + anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); + return false; + } + + return true; +} +#endif + +/** + * VK_INTEL_performance_query layout : + * + * --------------------------------- + * | availability (8b) | + * |-------------------------------| + * | marker (8b) | + * |-------------------------------| + * | some padding (see | + * | query_field_layout:alignment) | + * |-------------------------------| + * | query data | + * | (2 * query_field_layout:size) | + * --------------------------------- + */ + +static uint32_t +intel_perf_marker_offset(void) +{ + return 8; +} + +static uint32_t +intel_perf_query_data_offset(struct anv_query_pool *pool, bool end) +{ + return pool->data_offset + (end ? 
pool->snapshot_size : 0); +} + +static void +cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags, + uint32_t value_index, uint64_t result) +{ + if (flags & VK_QUERY_RESULT_64_BIT) { + uint64_t *dst64 = dst_slot; + dst64[value_index] = result; + } else { + uint32_t *dst32 = dst_slot; + dst32[value_index] = result; + } +} + +static void * +query_slot(struct anv_query_pool *pool, uint32_t query) +{ + return pool->bo->map + query * pool->stride; +} + +static bool +query_is_available(struct anv_query_pool *pool, uint32_t query) +{ +#if GFX_VER >= 8 + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + volatile uint64_t *slot = + pool->bo->map + khr_perf_query_availability_offset(pool, query, p); + if (!slot[0]) + return false; + } + return true; + } +#endif + + return *(volatile uint64_t *)query_slot(pool, query); +} + +static VkResult +wait_for_available(struct anv_device *device, + struct anv_query_pool *pool, uint32_t query) +{ + uint64_t abs_timeout_ns = os_time_get_absolute_timeout(2 * NSEC_PER_SEC); + + while (os_time_get_nano() < abs_timeout_ns) { + if (query_is_available(pool, query)) + return VK_SUCCESS; + VkResult status = vk_device_check_status(&device->vk); + if (status != VK_SUCCESS) + return status; + } + + return vk_device_set_lost(&device->vk, "query timeout"); +} + +VkResult genX(GetQueryPoolResults)( + VkDevice _device, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount, + size_t dataSize, + void* pData, + VkDeviceSize stride, + VkQueryResultFlags flags) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + + assert(pool->type == VK_QUERY_TYPE_OCCLUSION || + pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || + pool->type == VK_QUERY_TYPE_TIMESTAMP || + pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT || + pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || + pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL || + pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT); + + if (vk_device_is_lost(&device->vk)) + return VK_ERROR_DEVICE_LOST; + + if (pData == NULL) + return VK_SUCCESS; + + void *data_end = pData + dataSize; + + VkResult status = VK_SUCCESS; + for (uint32_t i = 0; i < queryCount; i++) { + bool available = query_is_available(pool, firstQuery + i); + + if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) { + status = wait_for_available(device, pool, firstQuery + i); + if (status != VK_SUCCESS) { + return status; + } + + available = true; + } + + /* From the Vulkan 1.0.42 spec: + * + * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are + * both not set then no result values are written to pData for + * queries that are in the unavailable state at the time of the call, + * and vkGetQueryPoolResults returns VK_NOT_READY. However, + * availability state is still written to pData for those queries if + * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set." 
+ * + * From VK_KHR_performance_query : + * + * "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies + * that the result should contain the number of counters that were recorded + * into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR" + */ + bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT); + + uint32_t idx = 0; + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + uint64_t *slot = query_slot(pool, firstQuery + i); + if (write_results) { + /* From the Vulkan 1.2.132 spec: + * + * "If VK_QUERY_RESULT_PARTIAL_BIT is set, + * VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status + * is unavailable, an intermediate result value between zero and + * the final result value is written to pData for that query." + */ + uint64_t result = available ? slot[2] - slot[1] : 0; + cpu_write_query_result(pData, flags, idx, result); + } + idx++; + break; + } + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + uint64_t *slot = query_slot(pool, firstQuery + i); + uint32_t statistics = pool->pipeline_statistics; + while (statistics) { + uint32_t stat = u_bit_scan(&statistics); + if (write_results) { + uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1]; + + /* WaDividePSInvocationCountBy4:HSW,BDW */ + if ((device->info->ver == 8 || device->info->verx10 == 75) && + (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) + result >>= 2; + + cpu_write_query_result(pData, flags, idx, result); + } + idx++; + } + assert(idx == util_bitcount(pool->pipeline_statistics)); + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + uint64_t *slot = query_slot(pool, firstQuery + i); + if (write_results) + cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]); + idx++; + if (write_results) + cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]); + idx++; + break; + } + + case VK_QUERY_TYPE_TIMESTAMP: { + uint64_t *slot = query_slot(pool, firstQuery + i); + if (write_results) + cpu_write_query_result(pData, flags, idx, slot[1]); + idx++; + break; + } + +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + const struct anv_physical_device *pdevice = device->physical; + assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT | + VK_QUERY_RESULT_PARTIAL_BIT)) == 0); + for (uint32_t p = 0; p < pool->n_passes; p++) { + const struct intel_perf_query_info *query = pool->pass_query[p]; + struct intel_perf_query_result result; + intel_perf_query_result_clear(&result); + intel_perf_query_result_accumulate_fields(&result, query, + pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false), + pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true), + false /* no_oa_accumulate */); + anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData); + } + break; + } +#endif + + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + if (!write_results) + break; + const void *query_data = query_slot(pool, firstQuery + i); + const struct intel_perf_query_info *query = &device->physical->perf->queries[0]; + struct intel_perf_query_result result; + intel_perf_query_result_clear(&result); + intel_perf_query_result_accumulate_fields(&result, query, + query_data + intel_perf_query_data_offset(pool, false), + query_data + intel_perf_query_data_offset(pool, true), + false /* no_oa_accumulate */); + intel_perf_query_result_write_mdapi(pData, stride, + device->info, + query, &result); + const uint64_t *marker = query_data + intel_perf_marker_offset(); 
+ intel_perf_query_mdapi_write_marker(pData, stride, device->info, *marker); + break; + } + + default: + unreachable("invalid pool type"); + } + + if (!write_results) + status = VK_NOT_READY; + + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) + cpu_write_query_result(pData, flags, idx, available); + + pData += stride; + if (pData >= data_end) + break; + } + + return status; +} + +static void +emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr) +{ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DestinationAddressType = DAT_PPGTT; + pc.PostSyncOperation = WritePSDepthCount; + pc.DepthStallEnable = true; + pc.Address = addr; + + if (GFX_VER == 9 && cmd_buffer->device->info->gt == 4) + pc.CommandStreamerStallEnable = true; + } +} + +static void +emit_query_mi_availability(struct mi_builder *b, + struct anv_address addr, + bool available) +{ + mi_store(b, mi_mem64(addr), mi_imm(available)); +} + +static void +emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr, + bool available) +{ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DestinationAddressType = DAT_PPGTT; + pc.PostSyncOperation = WriteImmediateData; + pc.Address = addr; + pc.ImmediateData = available; + } +} + +/** + * Goes through a series of consecutive query indices in the given pool + * setting all element values to 0 and emitting them as available. + */ +static void +emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, + struct mi_builder *b, struct anv_query_pool *pool, + uint32_t first_index, uint32_t num_queries) +{ + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_TIMESTAMP: + /* These queries are written with a PIPE_CONTROL so clear them using the + * PIPE_CONTROL as well so we don't have to synchronize between 2 types + * of operations. 
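+       *
+       * Note that the loop below zeroes the data qwords first and only then
+       * writes the availability qword (offset 0), so a freshly zeroed query
+       * never looks available before its values are in place.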
+ */ + assert((pool->stride % 8) == 0); + for (uint32_t i = 0; i < num_queries; i++) { + struct anv_address slot_addr = + anv_query_address(pool, first_index + i); + + for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) { + emit_query_pc_availability(cmd_buffer, + anv_address_add(slot_addr, qword * 8), + false); + } + emit_query_pc_availability(cmd_buffer, slot_addr, true); + } + break; + + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + for (uint32_t i = 0; i < num_queries; i++) { + struct anv_address slot_addr = + anv_query_address(pool, first_index + i); + mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8); + emit_query_mi_availability(b, slot_addr, true); + } + break; + +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + for (uint32_t i = 0; i < num_queries; i++) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + mi_memset(b, khr_perf_query_data_address(pool, first_index + i, p, false), + 0, 2 * pool->snapshot_size); + emit_query_mi_availability(b, + khr_perf_query_availability_address(pool, first_index + i, p), + true); + } + } + break; + } +#endif + + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: + for (uint32_t i = 0; i < num_queries; i++) { + struct anv_address slot_addr = + anv_query_address(pool, first_index + i); + mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8); + emit_query_mi_availability(b, slot_addr, true); + } + break; + + default: + unreachable("Unsupported query type"); + } +} + +void genX(CmdResetQueryPool)( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + for (uint32_t i = 0; i < queryCount; i++) { + emit_query_pc_availability(cmd_buffer, + anv_query_address(pool, firstQuery + i), + false); + } + break; + + case VK_QUERY_TYPE_TIMESTAMP: { + for (uint32_t i = 0; i < queryCount; i++) { + emit_query_pc_availability(cmd_buffer, + anv_query_address(pool, firstQuery + i), + false); + } + + /* Add a CS stall here to make sure the PIPE_CONTROL above has + * completed. Otherwise some timestamps written later with MI_STORE_* + * commands might race with the PIPE_CONTROL in the loop above. 
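+       *
+       * In other words, without the stall a timestamp written right after
+       * this reset with an MI_STORE_* command could land first and then be
+       * clobbered when the PIPE_CONTROL's write finally retires.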
+ */ + anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT, + "vkCmdResetQueryPool of timestamps"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + break; + } + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t i = 0; i < queryCount; i++) + emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false); + break; + } + +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t i = 0; i < queryCount; i++) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + emit_query_mi_availability( + &b, + khr_perf_query_availability_address(pool, firstQuery + i, p), + false); + } + } + break; + } +#endif + + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t i = 0; i < queryCount; i++) + emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false); + break; + } + + default: + unreachable("Unsupported query type"); + } +} + +void genX(ResetQueryPool)( + VkDevice _device, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount) +{ + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + + for (uint32_t i = 0; i < queryCount; i++) { + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { +#if GFX_VER >= 8 + for (uint32_t p = 0; p < pool->n_passes; p++) { + uint64_t *pass_slot = pool->bo->map + + khr_perf_query_availability_offset(pool, firstQuery + i, p); + *pass_slot = 0; + } +#endif + } else { + uint64_t *slot = query_slot(pool, firstQuery + i); + *slot = 0; + } + } +} + +static const uint32_t vk_pipeline_stat_to_reg[] = { + GENX(IA_VERTICES_COUNT_num), + GENX(IA_PRIMITIVES_COUNT_num), + GENX(VS_INVOCATION_COUNT_num), + GENX(GS_INVOCATION_COUNT_num), + GENX(GS_PRIMITIVES_COUNT_num), + GENX(CL_INVOCATION_COUNT_num), + GENX(CL_PRIMITIVES_COUNT_num), + GENX(PS_INVOCATION_COUNT_num), + GENX(HS_INVOCATION_COUNT_num), + GENX(DS_INVOCATION_COUNT_num), + GENX(CS_INVOCATION_COUNT_num), +}; + +static void +emit_pipeline_stat(struct mi_builder *b, uint32_t stat, + struct anv_address addr) +{ + STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK == + (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1); + + assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg)); + mi_store(b, mi_mem64(addr), mi_reg64(vk_pipeline_stat_to_reg[stat])); +} + +static void +emit_xfb_query(struct mi_builder *b, uint32_t stream, + struct anv_address addr) +{ + assert(stream < MAX_XFB_STREAMS); + + mi_store(b, mi_mem64(anv_address_add(addr, 0)), + mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8)); + mi_store(b, mi_mem64(anv_address_add(addr, 16)), + mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8)); +} + +static void +emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer, + struct anv_query_pool *pool, + struct mi_builder *b, + struct anv_address query_addr, + bool end) +{ + const struct intel_perf_query_field_layout *layout = + &cmd_buffer->device->physical->perf->query_layout; + struct anv_address data_addr = + anv_address_add(query_addr, intel_perf_query_data_offset(pool, end)); + + for (uint32_t f = 0; f < layout->n_fields; f++) { + const struct intel_perf_query_field *field = + &layout->fields[end ? 
f : (layout->n_fields - 1 - f)]; + + switch (field->type) { + case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: + anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) { + rpc.MemoryAddress = anv_address_add(data_addr, field->location); + } + break; + + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: { + struct anv_address addr = anv_address_add(data_addr, field->location); + struct mi_value src = field->size == 8 ? + mi_reg64(field->mmio_offset) : + mi_reg32(field->mmio_offset); + struct mi_value dst = field->size == 8 ? + mi_mem64(addr) : mi_mem32(addr); + mi_store(b, dst, src); + break; + } + + default: + unreachable("Invalid query field"); + break; + } + } +} + +void genX(CmdBeginQuery)( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query, + VkQueryControlFlags flags) +{ + genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0); +} + +void genX(CmdBeginQueryIndexedEXT)( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query, + VkQueryControlFlags flags, + uint32_t index) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + struct anv_address query_addr = anv_query_address(pool, query); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8)); + break; + + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)), + mi_reg64(GENX(CL_INVOCATION_COUNT_num))); + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + /* TODO: This might only be necessary for certain stats */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + + uint32_t statistics = pool->pipeline_statistics; + uint32_t offset = 8; + while (statistics) { + uint32_t stat = u_bit_scan(&statistics); + emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset)); + offset += 16; + } + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + emit_xfb_query(&b, index, anv_address_add(query_addr, 8)); + break; + +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + if (!khr_perf_query_ensure_relocs(cmd_buffer)) + return; + + const struct anv_physical_device *pdevice = cmd_buffer->device->physical; + const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout; + + uint32_t reloc_idx = 0; + for (uint32_t end = 0; end < 2; end++) { + for (uint32_t r = 0; r < layout->n_fields; r++) { + const struct intel_perf_query_field *field = + &layout->fields[end ? 
r : (layout->n_fields - 1 - r)]; + struct mi_value reg_addr = + mi_iadd( + &b, + mi_imm(intel_canonical_address(pool->bo->offset + + khr_perf_query_data_offset(pool, query, 0, end) + + field->location)), + mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); + cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr); + + if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC && + field->size == 8) { + reg_addr = + mi_iadd( + &b, + mi_imm(intel_canonical_address(pool->bo->offset + + khr_perf_query_data_offset(pool, query, 0, end) + + field->location + 4)), + mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); + cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr); + } + } + } + + struct mi_value availability_write_offset = + mi_iadd( + &b, + mi_imm( + intel_canonical_address( + pool->bo->offset + + khr_perf_query_availability_offset(pool, query, 0 /* pass */))), + mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); + cmd_buffer->self_mod_locations[reloc_idx++] = + mi_store_address(&b, availability_write_offset); + + assert(reloc_idx == pdevice->n_perf_query_commands); + + mi_self_mod_barrier(&b); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + cmd_buffer->perf_query_pool = pool; + + cmd_buffer->perf_reloc_idx = 0; + for (uint32_t r = 0; r < layout->n_fields; r++) { + const struct intel_perf_query_field *field = + &layout->fields[layout->n_fields - 1 - r]; + void *dws; + + switch (field->type) { + case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: + dws = anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_REPORT_PERF_COUNT_length), + GENX(MI_REPORT_PERF_COUNT), + .MemoryAddress = query_addr /* Will be overwritten */); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8); + break; + + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: + dws = + anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_STORE_REGISTER_MEM_length), + GENX(MI_STORE_REGISTER_MEM), + .RegisterAddress = field->mmio_offset, + .MemoryAddress = query_addr /* Will be overwritten */ ); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); + if (field->size == 8) { + dws = + anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_STORE_REGISTER_MEM_length), + GENX(MI_STORE_REGISTER_MEM), + .RegisterAddress = field->mmio_offset + 4, + .MemoryAddress = query_addr /* Will be overwritten */ ); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); + } + break; + + default: + unreachable("Invalid query field"); + break; + } + } + break; + } +#endif + + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false); + break; + } + + default: + unreachable(""); + } +} + +void genX(CmdEndQuery)( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query) +{ + genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0); +} + +void genX(CmdEndQueryIndexedEXT)( + VkCommandBuffer 
commandBuffer, + VkQueryPool queryPool, + uint32_t query, + uint32_t index) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + struct anv_address query_addr = anv_query_address(pool, query); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16)); + emit_query_pc_availability(cmd_buffer, query_addr, true); + break; + + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + /* Ensure previous commands have completed before capturing the register + * value. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + + mi_store(&b, mi_mem64(anv_address_add(query_addr, 16)), + mi_reg64(GENX(CL_INVOCATION_COUNT_num))); + emit_query_mi_availability(&b, query_addr, true); + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + /* TODO: This might only be necessary for certain stats */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + + uint32_t statistics = pool->pipeline_statistics; + uint32_t offset = 16; + while (statistics) { + uint32_t stat = u_bit_scan(&statistics); + emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset)); + offset += 16; + } + + emit_query_mi_availability(&b, query_addr, true); + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + + emit_xfb_query(&b, index, anv_address_add(query_addr, 16)); + emit_query_mi_availability(&b, query_addr, true); + break; + +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + cmd_buffer->perf_query_pool = pool; + + if (!khr_perf_query_ensure_relocs(cmd_buffer)) + return; + + const struct anv_physical_device *pdevice = cmd_buffer->device->physical; + const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout; + + void *dws; + for (uint32_t r = 0; r < layout->n_fields; r++) { + const struct intel_perf_query_field *field = &layout->fields[r]; + + switch (field->type) { + case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: + dws = anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_REPORT_PERF_COUNT_length), + GENX(MI_REPORT_PERF_COUNT), + .MemoryAddress = query_addr /* Will be overwritten */); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8); + break; + + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: + dws = + anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_STORE_REGISTER_MEM_length), + GENX(MI_STORE_REGISTER_MEM), + .RegisterAddress = field->mmio_offset, + .MemoryAddress = query_addr /* Will be overwritten */ ); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); + if (field->size == 8) { + dws = + 
anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_STORE_REGISTER_MEM_length), + GENX(MI_STORE_REGISTER_MEM), + .RegisterAddress = field->mmio_offset + 4, + .MemoryAddress = query_addr /* Will be overwritten */ ); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); + } + break; + + default: + unreachable("Invalid query field"); + break; + } + } + + dws = + anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_STORE_DATA_IMM_length), + GENX(MI_STORE_DATA_IMM), + .ImmediateData = true); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_STORE_DATA_IMM_Address_start) / 8); + + assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands); + break; + } +#endif + + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + uint32_t marker_offset = intel_perf_marker_offset(); + mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)), + mi_imm(cmd_buffer->intel_perf_marker)); + emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, true); + emit_query_mi_availability(&b, query_addr, true); + break; + } + + default: + unreachable(""); + } + + /* When multiview is active the spec requires that N consecutive query + * indices are used, where N is the number of active views in the subpass. + * The spec allows that we only write the results to one of the queries + * but we still need to manage result availability for all the query indices. + * Since we only emit a single query for all active views in the + * first index, mark the other query indices as being already available + * with result 0. + */ + if (cmd_buffer->state.gfx.view_mask) { + const uint32_t num_queries = + util_bitcount(cmd_buffer->state.gfx.view_mask); + if (num_queries > 1) + emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1); + } +} + +#define TIMESTAMP 0x2358 + +void genX(CmdWriteTimestamp2)( + VkCommandBuffer commandBuffer, + VkPipelineStageFlags2 stage, + VkQueryPool queryPool, + uint32_t query) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + struct anv_address query_addr = anv_query_address(pool, query); + + assert(pool->type == VK_QUERY_TYPE_TIMESTAMP); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) { + mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)), + mi_reg64(TIMESTAMP)); + emit_query_mi_availability(&b, query_addr, true); + } else { + /* Everything else is bottom-of-pipe */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DestinationAddressType = DAT_PPGTT; + pc.PostSyncOperation = WriteTimestamp; + pc.Address = anv_address_add(query_addr, 8); + + if (GFX_VER == 9 && cmd_buffer->device->info->gt == 4) + pc.CommandStreamerStallEnable = true; + } + emit_query_pc_availability(cmd_buffer, query_addr, true); + } + + + /* When multiview is active the spec requires that N consecutive query + * indices are used, where N is the number of active views in the subpass. 
+ * The spec allows that we only write the results to one of the queries
+ * but we still need to manage result availability for all the query indices.
+ * Since we only emit a single query for all active views in the
+ * first index, mark the other query indices as being already available
+ * with result 0.
+ */
+ if (cmd_buffer->state.gfx.view_mask) {
+ const uint32_t num_queries =
+ util_bitcount(cmd_buffer->state.gfx.view_mask);
+ if (num_queries > 1)
+ emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
+ }
+}
+
+#if GFX_VERx10 >= 75
+
+#define MI_PREDICATE_SRC0 0x2400
+#define MI_PREDICATE_SRC1 0x2408
+#define MI_PREDICATE_RESULT 0x2418
+
+/**
+ * Writes the results of a query to dst_addr if the value at poll_addr is equal
+ * to the reference value.
+ */
+static void
+gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
+ struct mi_builder *b,
+ struct anv_address poll_addr,
+ struct anv_address dst_addr,
+ uint64_t ref_value,
+ VkQueryResultFlags flags,
+ uint32_t value_index,
+ struct mi_value query_result)
+{
+ mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem64(poll_addr));
+ mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(ref_value));
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOAD;
+ mip.CombineOperation = COMBINE_SET;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ }
+
+ if (flags & VK_QUERY_RESULT_64_BIT) {
+ struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
+ mi_store_if(b, mi_mem64(res_addr), query_result);
+ } else {
+ struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
+ mi_store_if(b, mi_mem32(res_addr), query_result);
+ }
+}
+
+static void
+gpu_write_query_result(struct mi_builder *b,
+ struct anv_address dst_addr,
+ VkQueryResultFlags flags,
+ uint32_t value_index,
+ struct mi_value query_result)
+{
+ if (flags & VK_QUERY_RESULT_64_BIT) {
+ struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
+ mi_store(b, mi_mem64(res_addr), query_result);
+ } else {
+ struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
+ mi_store(b, mi_mem32(res_addr), query_result);
+ }
+}
+
+static struct mi_value
+compute_query_result(struct mi_builder *b, struct anv_address addr)
+{
+ return mi_isub(b, mi_mem64(anv_address_add(addr, 8)),
+ mi_mem64(anv_address_add(addr, 0)));
+}
+
+void genX(CmdCopyQueryPoolResults)(
+ VkCommandBuffer commandBuffer,
+ VkQueryPool queryPool,
+ uint32_t firstQuery,
+ uint32_t queryCount,
+ VkBuffer destBuffer,
+ VkDeviceSize destOffset,
+ VkDeviceSize destStride,
+ VkQueryResultFlags flags)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+ ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ struct mi_value result;
+
+ /* If render target writes are ongoing, request a render target cache flush
+ * to ensure proper ordering of the commands from the 3d pipe and the
+ * command streamer.
+ */ + if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_TILE_CACHE_FLUSH_BIT | + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, + "CopyQueryPoolResults"); + } + + if ((flags & VK_QUERY_RESULT_WAIT_BIT) || + (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) || + /* Occlusion & timestamp queries are written using a PIPE_CONTROL and + * because we're about to copy values from MI commands, we need to + * stall the command streamer to make sure the PIPE_CONTROL values have + * landed, otherwise we could see inconsistent values & availability. + * + * From the vulkan spec: + * + * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of + * previous uses of vkCmdResetQueryPool in the same queue, without + * any additional synchronization." + */ + pool->type == VK_QUERY_TYPE_OCCLUSION || + pool->type == VK_QUERY_TYPE_TIMESTAMP) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "CopyQueryPoolResults"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + + struct anv_address dest_addr = anv_address_add(buffer->address, destOffset); + for (uint32_t i = 0; i < queryCount; i++) { + struct anv_address query_addr = anv_query_address(pool, firstQuery + i); + uint32_t idx = 0; + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + result = compute_query_result(&b, anv_address_add(query_addr, 8)); + /* Like in the case of vkGetQueryPoolResults, if the query is + * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set, + * conservatively write 0 as the query result. If the + * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value. + */ + gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr, + 1 /* available */, flags, idx, result); + if (flags & VK_QUERY_RESULT_PARTIAL_BIT) { + gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr, + 0 /* unavailable */, flags, idx, mi_imm(0)); + } + idx++; + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + uint32_t statistics = pool->pipeline_statistics; + while (statistics) { + uint32_t stat = u_bit_scan(&statistics); + + result = compute_query_result(&b, anv_address_add(query_addr, + idx * 16 + 8)); + + /* WaDividePSInvocationCountBy4:HSW,BDW */ + if ((cmd_buffer->device->info->ver == 8 || + cmd_buffer->device->info->verx10 == 75) && + (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) { + result = mi_ushr32_imm(&b, result, 2); + } + + gpu_write_query_result(&b, dest_addr, flags, idx++, result); + } + assert(idx == util_bitcount(pool->pipeline_statistics)); + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + result = compute_query_result(&b, anv_address_add(query_addr, 8)); + gpu_write_query_result(&b, dest_addr, flags, idx++, result); + result = compute_query_result(&b, anv_address_add(query_addr, 24)); + gpu_write_query_result(&b, dest_addr, flags, idx++, result); + break; + + case VK_QUERY_TYPE_TIMESTAMP: + result = mi_mem64(anv_address_add(query_addr, 8)); + gpu_write_query_result(&b, dest_addr, flags, idx++, result); + break; + +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + unreachable("Copy KHR performance query results not implemented"); + break; +#endif + + default: + unreachable("unhandled query type"); + } + + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { + gpu_write_query_result(&b, dest_addr, flags, idx, + mi_mem64(query_addr)); + } + + dest_addr = anv_address_add(dest_addr, destStride); 
+ } +} + +#else +void genX(CmdCopyQueryPoolResults)( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount, + VkBuffer destBuffer, + VkDeviceSize destOffset, + VkDeviceSize destStride, + VkQueryResultFlags flags) +{ + anv_finishme("Queries not yet supported on Ivy Bridge"); +} +#endif diff --git a/src/intel/vulkan_hasvk/genX_state.c b/src/intel/vulkan_hasvk/genX_state.c new file mode 100644 index 00000000000..b568960907d --- /dev/null +++ b/src/intel/vulkan_hasvk/genX_state.c @@ -0,0 +1,1141 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "anv_private.h" + +#include "common/intel_aux_map.h" +#include "common/intel_sample_positions.h" +#include "common/intel_pixel_hash.h" +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" + +#include "vk_standard_sample_locations.h" +#include "vk_util.h" + +static void +genX(emit_slice_hashing_state)(struct anv_device *device, + struct anv_batch *batch) +{ +#if GFX_VER == 11 + /* Gfx11 hardware has two pixel pipes at most. */ + for (unsigned i = 2; i < ARRAY_SIZE(device->info->ppipe_subslices); i++) + assert(device->info->ppipe_subslices[i] == 0); + + if (device->info->ppipe_subslices[0] == device->info->ppipe_subslices[1]) + return; + + if (!device->slice_hash.alloc_size) { + unsigned size = GENX(SLICE_HASH_TABLE_length) * 4; + device->slice_hash = + anv_state_pool_alloc(&device->dynamic_state_pool, size, 64); + + const bool flip = device->info->ppipe_subslices[0] < + device->info->ppipe_subslices[1]; + struct GENX(SLICE_HASH_TABLE) table; + intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]); + + GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table); + } + + anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) { + ptr.SliceHashStatePointerValid = true; + ptr.SliceHashTableStatePointer = device->slice_hash.offset; + } + + anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) { + mode.SliceHashingTableEnable = true; + } +#elif GFX_VERx10 == 120 + /* For each n calculate ppipes_of[n], equal to the number of pixel pipes + * present with n active dual subslices. 
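+ * (For example, a part where all three pixel pipes expose two active dual
+ * subslices yields ppipes_of = { 0, 0, 3 }, i.e. the fully symmetric case
+ * handled by the early return below.)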
+ */ + unsigned ppipes_of[3] = {}; + + for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) { + for (unsigned p = 0; p < 3; p++) + ppipes_of[n] += (device->info->ppipe_subslices[p] == n); + } + + /* Gfx12 has three pixel pipes. */ + for (unsigned p = 3; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) + assert(device->info->ppipe_subslices[p] == 0); + + if (ppipes_of[2] == 3 || ppipes_of[0] == 2) { + /* All three pixel pipes have the maximum number of active dual + * subslices, or there is only one active pixel pipe: Nothing to do. + */ + return; + } + + anv_batch_emit(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) { + p.SliceHashControl[0] = TABLE_0; + + if (ppipes_of[2] == 2 && ppipes_of[0] == 1) + intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]); + else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1) + intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]); + + if (ppipes_of[2] == 2 && ppipes_of[1] == 1) + intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]); + else if (ppipes_of[2] == 2 && ppipes_of[0] == 1) + intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]); + else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1) + intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]); + else + unreachable("Illegal fusing."); + } + + anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) { + p.SubsliceHashingTableEnable = true; + p.SubsliceHashingTableEnableMask = true; + } +#elif GFX_VERx10 == 125 + uint32_t ppipe_mask = 0; + for (unsigned p = 0; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) { + if (device->info->ppipe_subslices[p]) + ppipe_mask |= (1u << p); + } + assert(ppipe_mask); + + if (!device->slice_hash.alloc_size) { + unsigned size = GENX(SLICE_HASH_TABLE_length) * 4; + device->slice_hash = + anv_state_pool_alloc(&device->dynamic_state_pool, size, 64); + + struct GENX(SLICE_HASH_TABLE) table; + + /* Note that the hardware expects an array with 7 tables, each + * table is intended to specify the pixel pipe hashing behavior + * for every possible slice count between 2 and 8, however that + * doesn't actually work, among other reasons due to hardware + * bugs that will cause the GPU to erroneously access the table + * at the wrong index in some cases, so in practice all 7 tables + * need to be initialized to the same value. + */ + for (unsigned i = 0; i < 7; i++) + intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask, table.Entry[i][0]); + + GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table); + } + + anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) { + ptr.SliceHashStatePointerValid = true; + ptr.SliceHashTableStatePointer = device->slice_hash.offset; + } + + anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) { + mode.SliceHashingTableEnable = true; + mode.SliceHashingTableEnableMask = true; + mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ? + hashing32x32 : NormalMode); + mode.CrossSliceHashingModeMask = -1; + } +#endif +} + +static void +init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch) +{ + UNUSED struct anv_device *device = queue->device; + +#if GFX_VER >= 11 + /* Starting with GFX version 11, SLM is no longer part of the L3$ config + * so it never changes throughout the lifetime of the VkDevice. 
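+ * (It is therefore programmed once per queue here and the chosen config is
+ * cached in device->l3_config.)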
+ */ + const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info); + genX(emit_l3_config)(batch, device, cfg); + device->l3_config = cfg; +#endif + +#if GFX_VERx10 >= 125 + /* GEN:BUG:1607854226: + * + * Non-pipelined state has issues with not applying in MEDIA/GPGPU mode. + * Fortunately, we always start the context off in 3D mode. + */ + uint32_t mocs = device->isl_dev.mocs.internal; + anv_batch_emit(batch, GENX(STATE_BASE_ADDRESS), sba) { + sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 }; + sba.GeneralStateBufferSize = 0xfffff; + sba.GeneralStateMOCS = mocs; + sba.GeneralStateBaseAddressModifyEnable = true; + sba.GeneralStateBufferSizeModifyEnable = true; + + sba.StatelessDataPortAccessMOCS = mocs; + + sba.SurfaceStateBaseAddress = + (struct anv_address) { .offset = SURFACE_STATE_POOL_MIN_ADDRESS }; + sba.SurfaceStateMOCS = mocs; + sba.SurfaceStateBaseAddressModifyEnable = true; + + sba.DynamicStateBaseAddress = + (struct anv_address) { .offset = DYNAMIC_STATE_POOL_MIN_ADDRESS }; + sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096; + sba.DynamicStateMOCS = mocs; + sba.DynamicStateBaseAddressModifyEnable = true; + sba.DynamicStateBufferSizeModifyEnable = true; + + sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 }; + sba.IndirectObjectBufferSize = 0xfffff; + sba.IndirectObjectMOCS = mocs; + sba.IndirectObjectBaseAddressModifyEnable = true; + sba.IndirectObjectBufferSizeModifyEnable = true; + + sba.InstructionBaseAddress = + (struct anv_address) { .offset = INSTRUCTION_STATE_POOL_MIN_ADDRESS }; + sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096; + sba.InstructionMOCS = mocs; + sba.InstructionBaseAddressModifyEnable = true; + sba.InstructionBuffersizeModifyEnable = true; + + sba.BindlessSurfaceStateBaseAddress = + (struct anv_address) { .offset = SURFACE_STATE_POOL_MIN_ADDRESS }; + sba.BindlessSurfaceStateSize = (1 << 20) - 1; + sba.BindlessSurfaceStateMOCS = mocs; + sba.BindlessSurfaceStateBaseAddressModifyEnable = true; + + sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 }; + sba.BindlessSamplerStateMOCS = mocs; + sba.BindlessSamplerStateBaseAddressModifyEnable = true; + sba.BindlessSamplerStateBufferSize = 0; + + sba.L1CacheControl = L1CC_WB; + } +#endif +} + +static VkResult +init_render_queue_state(struct anv_queue *queue) +{ + struct anv_device *device = queue->device; + uint32_t cmds[128]; + struct anv_batch batch = { + .start = cmds, + .next = cmds, + .end = (void *) cmds + sizeof(cmds), + }; + + anv_batch_emit(&batch, GENX(PIPELINE_SELECT), ps) { +#if GFX_VER >= 9 + ps.MaskBits = GFX_VER >= 12 ? 
0x13 : 3; + ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12; +#endif + ps.PipelineSelection = _3D; + } + +#if GFX_VER == 9 + anv_batch_write_reg(&batch, GENX(CACHE_MODE_1), cm1) { + cm1.FloatBlendOptimizationEnable = true; + cm1.FloatBlendOptimizationEnableMask = true; + cm1.MSCRAWHazardAvoidanceBit = true; + cm1.MSCRAWHazardAvoidanceBitMask = true; + cm1.PartialResolveDisableInVC = true; + cm1.PartialResolveDisableInVCMask = true; + } +#endif + + anv_batch_emit(&batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa); + + anv_batch_emit(&batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) { + rect.ClippedDrawingRectangleYMin = 0; + rect.ClippedDrawingRectangleXMin = 0; + rect.ClippedDrawingRectangleYMax = UINT16_MAX; + rect.ClippedDrawingRectangleXMax = UINT16_MAX; + rect.DrawingRectangleOriginY = 0; + rect.DrawingRectangleOriginX = 0; + } + +#if GFX_VER >= 8 + anv_batch_emit(&batch, GENX(3DSTATE_WM_CHROMAKEY), ck); + + genX(emit_sample_pattern)(&batch, NULL); + + /* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the + * section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer + * Clear." It mentions that the packet overrides GPU state for the clear + * operation and needs to be reset to 0s to clear the overrides. Depending + * on the kernel, we may not get a context with the state for this packet + * zeroed. Do it ourselves just in case. We've observed this to prevent a + * number of GPU hangs on ICL. + */ + anv_batch_emit(&batch, GENX(3DSTATE_WM_HZ_OP), hzp); +#endif + +#if GFX_VER == 11 + /* The default behavior of bit 5 "Headerless Message for Pre-emptable + * Contexts" in SAMPLER MODE register is set to 0, which means + * headerless sampler messages are not allowed for pre-emptable + * contexts. Set the bit 5 to 1 to allow them. + */ + anv_batch_write_reg(&batch, GENX(SAMPLER_MODE), sm) { + sm.HeaderlessMessageforPreemptableContexts = true; + sm.HeaderlessMessageforPreemptableContextsMask = true; + } + + /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in + * HALF_SLICE_CHICKEN7 register. + */ + anv_batch_write_reg(&batch, GENX(HALF_SLICE_CHICKEN7), hsc7) { + hsc7.EnabledTexelOffsetPrecisionFix = true; + hsc7.EnabledTexelOffsetPrecisionFixMask = true; + } + + anv_batch_write_reg(&batch, GENX(TCCNTLREG), tcc) { + tcc.L3DataPartialWriteMergingEnable = true; + tcc.ColorZPartialWriteMergingEnable = true; + tcc.URBPartialWriteMergingEnable = true; + tcc.TCDisable = true; + } +#endif + genX(emit_slice_hashing_state)(device, &batch); + +#if GFX_VER >= 11 + /* hardware specification recommends disabling repacking for + * the compatibility with decompression mechanism in display controller. + */ + if (device->info->disable_ccs_repack) { + anv_batch_write_reg(&batch, GENX(CACHE_MODE_0), cm0) { + cm0.DisableRepackingforCompression = true; + cm0.DisableRepackingforCompressionMask = true; + } + } + + /* an unknown issue is causing vs push constants to become + * corrupted during object-level preemption. For now, restrict + * to command buffer level preemption to avoid rendering + * corruption. + */ + anv_batch_write_reg(&batch, GENX(CS_CHICKEN1), cc1) { + cc1.ReplayMode = MidcmdbufferPreemption; + cc1.ReplayModeMask = true; + +#if GFX_VERx10 == 120 + cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = true; + cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true; +#endif + } + +#if GFX_VERx10 == 120 + /* Wa_1806527549 says to disable the following HiZ optimization when the + * depth buffer is D16_UNORM. 
We've found the WA to help with more depth + * buffer configurations however, so we always disable it just to be safe. + */ + anv_batch_write_reg(&batch, GENX(HIZ_CHICKEN), reg) { + reg.HZDepthTestLEGEOptimizationDisable = true; + reg.HZDepthTestLEGEOptimizationDisableMask = true; + } +#endif + +#if GFX_VERx10 < 125 +#define AA_LINE_QUALITY_REG GENX(3D_CHICKEN3) +#else +#define AA_LINE_QUALITY_REG GENX(CHICKEN_RASTER_1) +#endif + + /* Enable the new line drawing algorithm that produces higher quality + * lines. + */ + anv_batch_write_reg(&batch, AA_LINE_QUALITY_REG, c3) { + c3.AALineQualityFix = true; + c3.AALineQualityFixMask = true; + } +#endif + +#if GFX_VER == 12 + if (device->info->has_aux_map) { + uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx); + assert(aux_base_addr % (32 * 1024) == 0); + anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num); + lri.DataDWord = aux_base_addr & 0xffffffff; + } + anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4; + lri.DataDWord = aux_base_addr >> 32; + } + } +#endif + + /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so + * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address. + * + * This is only safe on kernels with context isolation support. + */ + if (GFX_VER >= 8 && device->physical->has_context_isolation) { +#if GFX_VER >= 9 + anv_batch_write_reg(&batch, GENX(CS_DEBUG_MODE2), csdm2) { + csdm2.CONSTANT_BUFFERAddressOffsetDisable = true; + csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true; + } +#elif GFX_VER == 8 + anv_batch_write_reg(&batch, GENX(INSTPM), instpm) { + instpm.CONSTANT_BUFFERAddressOffsetDisable = true; + instpm.CONSTANT_BUFFERAddressOffsetDisableMask = true; + } +#endif + } + + init_common_queue_state(queue, &batch); + + anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); + + assert(batch.next <= batch.end); + + return anv_queue_submit_simple_batch(queue, &batch); +} + +static VkResult +init_compute_queue_state(struct anv_queue *queue) +{ + struct anv_batch batch; + + uint32_t cmds[64]; + batch.start = batch.next = cmds; + batch.end = (void *) cmds + sizeof(cmds); + + anv_batch_emit(&batch, GENX(PIPELINE_SELECT), ps) { +#if GFX_VER >= 9 + ps.MaskBits = 3; +#endif +#if GFX_VER >= 11 + ps.MaskBits |= 0x10; + ps.MediaSamplerDOPClockGateEnable = true; +#endif + ps.PipelineSelection = GPGPU; + } + + init_common_queue_state(queue, &batch); + + anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); + + assert(batch.next <= batch.end); + + return anv_queue_submit_simple_batch(queue, &batch); +} + +void +genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice) +{ + assert(pdevice->info.verx10 == GFX_VERx10); +} + +VkResult +genX(init_device_state)(struct anv_device *device) +{ + VkResult res; + + device->slice_hash = (struct anv_state) { 0 }; + for (uint32_t i = 0; i < device->queue_count; i++) { + struct anv_queue *queue = &device->queues[i]; + switch (queue->family->engine_class) { + case I915_ENGINE_CLASS_RENDER: + res = init_render_queue_state(queue); + break; + case I915_ENGINE_CLASS_COMPUTE: + res = init_compute_queue_state(queue); + break; + default: + res = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + break; + } + if (res != VK_SUCCESS) + return res; + } + + return res; +} + +#if GFX_VERx10 >= 125 +#define maybe_for_each_shading_rate_op(name) \ + for (VkFragmentShadingRateCombinerOpKHR name = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR; 
\ + name <= VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR; \ + name++) +#elif GFX_VER >= 12 +#define maybe_for_each_shading_rate_op(name) +#endif + +/* Rather than reemitting the CPS_STATE structure everything those changes and + * for as many viewports as needed, we can just prepare all possible cases and + * just pick the right offset from the prepacked states when needed. + */ +void +genX(init_cps_device_state)(struct anv_device *device) +{ +#if GFX_VER >= 12 + void *cps_state_ptr = device->cps_states.map; + + /* Disabled CPS mode */ + for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) { + struct GENX(CPS_STATE) cps_state = { + .CoarsePixelShadingMode = CPS_MODE_CONSTANT, + .MinCPSizeX = 1, + .MinCPSizeY = 1, +#if GFX_VERx10 >= 125 + .Combiner0OpcodeforCPsize = PASSTHROUGH, + .Combiner1OpcodeforCPsize = PASSTHROUGH, +#endif /* GFX_VERx10 >= 125 */ + + }; + + GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state); + cps_state_ptr += GENX(CPS_STATE_length) * 4; + } + + maybe_for_each_shading_rate_op(op0) { + maybe_for_each_shading_rate_op(op1) { + for (uint32_t x = 1; x <= 4; x *= 2) { + for (uint32_t y = 1; y <= 4; y *= 2) { + struct GENX(CPS_STATE) cps_state = { + .CoarsePixelShadingMode = CPS_MODE_CONSTANT, + .MinCPSizeX = x, + .MinCPSizeY = y, + }; + +#if GFX_VERx10 >= 125 + static const uint32_t combiner_ops[] = { + [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR] = PASSTHROUGH, + [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = OVERRIDE, + [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR] = HIGH_QUALITY, + [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR] = LOW_QUALITY, + [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR] = RELATIVE, + }; + + cps_state.Combiner0OpcodeforCPsize = combiner_ops[op0]; + cps_state.Combiner1OpcodeforCPsize = combiner_ops[op1]; +#endif /* GFX_VERx10 >= 125 */ + + for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) { + GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state); + cps_state_ptr += GENX(CPS_STATE_length) * 4; + } + } + } + } + } +#endif /* GFX_VER >= 12 */ +} + +#if GFX_VER >= 12 +static uint32_t +get_cps_state_offset(struct anv_device *device, bool cps_enabled, + const struct vk_fragment_shading_rate_state *fsr) +{ + if (!cps_enabled) + return device->cps_states.offset; + + uint32_t offset; + static const uint32_t size_index[] = { + [1] = 0, + [2] = 1, + [4] = 2, + }; + +#if GFX_VERx10 >= 125 + offset = + 1 + /* skip disabled */ + fsr->combiner_ops[0] * 5 * 3 * 3 + + fsr->combiner_ops[1] * 3 * 3 + + size_index[fsr->fragment_size.width] * 3 + + size_index[fsr->fragment_size.height]; +#else + offset = + 1 + /* skip disabled */ + size_index[fsr->fragment_size.width] * 3 + + size_index[fsr->fragment_size.height]; +#endif + + offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4; + + return device->cps_states.offset + offset; +} +#endif /* GFX_VER >= 12 */ + +void +genX(emit_l3_config)(struct anv_batch *batch, + const struct anv_device *device, + const struct intel_l3_config *cfg) +{ + UNUSED const struct intel_device_info *devinfo = device->info; + +#if GFX_VER >= 8 + +#if GFX_VER >= 12 +#define L3_ALLOCATION_REG GENX(L3ALLOC) +#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num) +#else +#define L3_ALLOCATION_REG GENX(L3CNTLREG) +#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num) +#endif + + anv_batch_write_reg(batch, L3_ALLOCATION_REG, l3cr) { + if (cfg == NULL) { +#if GFX_VER >= 12 + l3cr.L3FullWayAllocationEnable = true; +#else + unreachable("Invalid L3$ config"); +#endif + } else { +#if GFX_VER < 11 + l3cr.SLMEnable = cfg->n[INTEL_L3P_SLM]; +#endif +#if GFX_VER 
== 11 + /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be + * set in L3CNTLREG register. The default setting of the bit is not + * the desirable behavior. + */ + l3cr.ErrorDetectionBehaviorControl = true; + l3cr.UseFullWays = true; +#endif /* GFX_VER == 11 */ + assert(cfg->n[INTEL_L3P_IS] == 0); + assert(cfg->n[INTEL_L3P_C] == 0); + assert(cfg->n[INTEL_L3P_T] == 0); + l3cr.URBAllocation = cfg->n[INTEL_L3P_URB]; + l3cr.ROAllocation = cfg->n[INTEL_L3P_RO]; + l3cr.DCAllocation = cfg->n[INTEL_L3P_DC]; + l3cr.AllAllocation = cfg->n[INTEL_L3P_ALL]; + } + } + +#else /* GFX_VER < 8 */ + + const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL]; + const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] || + cfg->n[INTEL_L3P_ALL]; + const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] || + cfg->n[INTEL_L3P_ALL]; + const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] || + cfg->n[INTEL_L3P_ALL]; + + assert(!cfg->n[INTEL_L3P_ALL]); + + /* When enabled SLM only uses a portion of the L3 on half of the banks, + * the matching space on the remaining banks has to be allocated to a + * client (URB for all validated configurations) set to the + * lower-bandwidth 2-bank address hashing mode. + */ + const bool urb_low_bw = cfg->n[INTEL_L3P_SLM] && devinfo->platform != INTEL_PLATFORM_BYT; + assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]); + + /* Minimum number of ways that can be allocated to the URB. */ + const unsigned n0_urb = devinfo->platform == INTEL_PLATFORM_BYT ? 32 : 0; + assert(cfg->n[INTEL_L3P_URB] >= n0_urb); + + anv_batch_write_reg(batch, GENX(L3SQCREG1), l3sqc) { + l3sqc.ConvertDC_UC = !has_dc; + l3sqc.ConvertIS_UC = !has_is; + l3sqc.ConvertC_UC = !has_c; + l3sqc.ConvertT_UC = !has_t; +#if GFX_VERx10 == 75 + l3sqc.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT; +#else + l3sqc.L3SQGeneralPriorityCreditInitialization = + devinfo->platform == INTEL_PLATFORM_BYT ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT; +#endif + l3sqc.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT; + } + + anv_batch_write_reg(batch, GENX(L3CNTLREG2), l3cr2) { + l3cr2.SLMEnable = cfg->n[INTEL_L3P_SLM]; + l3cr2.URBLowBandwidth = urb_low_bw; + l3cr2.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb; +#if !GFX_VERx10 == 75 + l3cr2.ALLAllocation = cfg->n[INTEL_L3P_ALL]; +#endif + l3cr2.ROAllocation = cfg->n[INTEL_L3P_RO]; + l3cr2.DCAllocation = cfg->n[INTEL_L3P_DC]; + } + + anv_batch_write_reg(batch, GENX(L3CNTLREG3), l3cr3) { + l3cr3.ISAllocation = cfg->n[INTEL_L3P_IS]; + l3cr3.ISLowBandwidth = 0; + l3cr3.CAllocation = cfg->n[INTEL_L3P_C]; + l3cr3.CLowBandwidth = 0; + l3cr3.TAllocation = cfg->n[INTEL_L3P_T]; + l3cr3.TLowBandwidth = 0; + } + +#if GFX_VERx10 == 75 + if (device->physical->cmd_parser_version >= 4) { + /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep + * them disabled to avoid crashing the system hard. 
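+ * (The disable bit is mirrored in both SCRATCH1 and CHICKEN3, so the two
+ * register writes below are kept in sync.)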
+ */ + anv_batch_write_reg(batch, GENX(SCRATCH1), s1) { + s1.L3AtomicDisable = !has_dc; + } + anv_batch_write_reg(batch, GENX(CHICKEN3), c3) { + c3.L3AtomicDisableMask = true; + c3.L3AtomicDisable = !has_dc; + } + } +#endif /* GFX_VERx10 == 75 */ + +#endif /* GFX_VER < 8 */ +} + +void +genX(emit_multisample)(struct anv_batch *batch, uint32_t samples, + const struct vk_sample_locations_state *sl) +{ + if (sl != NULL) { + assert(sl->per_pixel == samples); + assert(sl->grid_size.width == 1); + assert(sl->grid_size.height == 1); + } else { + sl = vk_standard_sample_locations_state(samples); + } + + anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) { + ms.NumberofMultisamples = __builtin_ffs(samples) - 1; + + ms.PixelLocation = CENTER; +#if GFX_VER >= 8 + /* The PRM says that this bit is valid only for DX9: + * + * SW can choose to set this bit only for DX9 API. DX10/OGL API's + * should not have any effect by setting or not setting this bit. + */ + ms.PixelPositionOffsetEnable = false; +#else + switch (samples) { + case 1: + INTEL_SAMPLE_POS_1X_ARRAY(ms.Sample, sl->locations); + break; + case 2: + INTEL_SAMPLE_POS_2X_ARRAY(ms.Sample, sl->locations); + break; + case 4: + INTEL_SAMPLE_POS_4X_ARRAY(ms.Sample, sl->locations); + break; + case 8: + INTEL_SAMPLE_POS_8X_ARRAY(ms.Sample, sl->locations); + break; + default: + break; + } +#endif + } +} + +#if GFX_VER >= 8 +void +genX(emit_sample_pattern)(struct anv_batch *batch, + const struct vk_sample_locations_state *sl) +{ + assert(sl == NULL || sl->grid_size.width == 1); + assert(sl == NULL || sl->grid_size.height == 1); + + /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and + * VkPhysicalDeviceFeatures::standardSampleLocations. + */ + anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_PATTERN), sp) { + /* The Skylake PRM Vol. 2a "3DSTATE_SAMPLE_PATTERN" says: + * + * "When programming the sample offsets (for NUMSAMPLES_4 or _8 + * and MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3 + * (or 7 for 8X, or 15 for 16X) must have monotonically increasing + * distance from the pixel center. This is required to get the + * correct centroid computation in the device." + * + * However, the Vulkan spec seems to require that the the samples occur + * in the order provided through the API. The standard sample patterns + * have the above property that they have monotonically increasing + * distances from the center but client-provided ones do not. As long as + * this only affects centroid calculations as the docs say, we should be + * ok because OpenGL and Vulkan only require that the centroid be some + * lit sample and that it's the same for all samples in a pixel; they + * have no requirement that it be the one closest to center. + */ + for (uint32_t i = 1; i <= (GFX_VER >= 9 ? 
16 : 8); i *= 2) { + switch (i) { + case VK_SAMPLE_COUNT_1_BIT: + if (sl && sl->per_pixel == i) { + INTEL_SAMPLE_POS_1X_ARRAY(sp._1xSample, sl->locations); + } else { + INTEL_SAMPLE_POS_1X(sp._1xSample); + } + break; + case VK_SAMPLE_COUNT_2_BIT: + if (sl && sl->per_pixel == i) { + INTEL_SAMPLE_POS_2X_ARRAY(sp._2xSample, sl->locations); + } else { + INTEL_SAMPLE_POS_2X(sp._2xSample); + } + break; + case VK_SAMPLE_COUNT_4_BIT: + if (sl && sl->per_pixel == i) { + INTEL_SAMPLE_POS_4X_ARRAY(sp._4xSample, sl->locations); + } else { + INTEL_SAMPLE_POS_4X(sp._4xSample); + } + break; + case VK_SAMPLE_COUNT_8_BIT: + if (sl && sl->per_pixel == i) { + INTEL_SAMPLE_POS_8X_ARRAY(sp._8xSample, sl->locations); + } else { + INTEL_SAMPLE_POS_8X(sp._8xSample); + } + break; +#if GFX_VER >= 9 + case VK_SAMPLE_COUNT_16_BIT: + if (sl && sl->per_pixel == i) { + INTEL_SAMPLE_POS_16X_ARRAY(sp._16xSample, sl->locations); + } else { + INTEL_SAMPLE_POS_16X(sp._16xSample); + } + break; +#endif + default: + unreachable("Invalid sample count"); + } + } + } +} +#endif + +#if GFX_VER >= 11 +void +genX(emit_shading_rate)(struct anv_batch *batch, + const struct anv_graphics_pipeline *pipeline, + const struct vk_fragment_shading_rate_state *fsr) +{ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + const bool cps_enable = wm_prog_data && wm_prog_data->per_coarse_pixel_dispatch; + +#if GFX_VER == 11 + anv_batch_emit(batch, GENX(3DSTATE_CPS), cps) { + cps.CoarsePixelShadingMode = cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE; + if (cps_enable) { + cps.MinCPSizeX = fsr->fragment_size.width; + cps.MinCPSizeY = fsr->fragment_size.height; + } + } +#elif GFX_VER >= 12 + /* TODO: we can optimize this flush in the following cases: + * + * In the case where the last geometry shader emits a value that is not + * constant, we can avoid this stall because we can synchronize the + * pixel shader internally with + * 3DSTATE_PS::EnablePSDependencyOnCPsizeChange. + * + * If we know that the previous pipeline and the current one are using + * the same fragment shading rate. + */ + anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { +#if GFX_VERx10 >= 125 + pc.PSSStallSyncEnable = true; +#else + pc.PSDSyncEnable = true; +#endif + } + + anv_batch_emit(batch, GENX(3DSTATE_CPS_POINTERS), cps) { + struct anv_device *device = pipeline->base.device; + + cps.CoarsePixelShadingStateArrayPointer = + get_cps_state_offset(device, cps_enable, fsr); + } +#endif +} +#endif /* GFX_VER >= 11 */ + +static uint32_t +vk_to_intel_tex_filter(VkFilter filter, bool anisotropyEnable) +{ + switch (filter) { + default: + unreachable("Invalid filter"); + case VK_FILTER_NEAREST: + return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_NEAREST; + case VK_FILTER_LINEAR: + return anisotropyEnable ? 
MAPFILTER_ANISOTROPIC : MAPFILTER_LINEAR;
+ }
+}
+
+static uint32_t
+vk_to_intel_max_anisotropy(float ratio)
+{
+ return (anv_clamp_f(ratio, 2, 16) - 2) / 2;
+}
+
+static const uint32_t vk_to_intel_mipmap_mode[] = {
+ [VK_SAMPLER_MIPMAP_MODE_NEAREST] = MIPFILTER_NEAREST,
+ [VK_SAMPLER_MIPMAP_MODE_LINEAR] = MIPFILTER_LINEAR
+};
+
+static const uint32_t vk_to_intel_tex_address[] = {
+ [VK_SAMPLER_ADDRESS_MODE_REPEAT] = TCM_WRAP,
+ [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = TCM_MIRROR,
+ [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE] = TCM_CLAMP,
+ [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
+ [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
+};
+
+/* Vulkan specifies the result of shadow comparisons as:
+ * 1 if ref <op> texel,
+ * 0 otherwise.
+ *
+ * The hardware does:
+ * 0 if texel <op> ref,
+ * 1 otherwise.
+ *
+ * So, these look a bit strange because there's both a negation
+ * and swapping of the arguments involved.
+ */
+static const uint32_t vk_to_intel_shadow_compare_op[] = {
+ [VK_COMPARE_OP_NEVER] = PREFILTEROP_ALWAYS,
+ [VK_COMPARE_OP_LESS] = PREFILTEROP_LEQUAL,
+ [VK_COMPARE_OP_EQUAL] = PREFILTEROP_NOTEQUAL,
+ [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROP_LESS,
+ [VK_COMPARE_OP_GREATER] = PREFILTEROP_GEQUAL,
+ [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROP_EQUAL,
+ [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GREATER,
+ [VK_COMPARE_OP_ALWAYS] = PREFILTEROP_NEVER,
+};
+
+#if GFX_VER >= 9
+static const uint32_t vk_to_intel_sampler_reduction_mode[] = {
+ [VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE] = STD_FILTER,
+ [VK_SAMPLER_REDUCTION_MODE_MIN] = MINIMUM,
+ [VK_SAMPLER_REDUCTION_MODE_MAX] = MAXIMUM,
+};
+#endif
+
+VkResult genX(CreateSampler)(
+ VkDevice _device,
+ const VkSamplerCreateInfo* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkSampler* pSampler)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ struct anv_sampler *sampler;
+
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO);
+
+ sampler = vk_object_zalloc(&device->vk, pAllocator, sizeof(*sampler),
+ VK_OBJECT_TYPE_SAMPLER);
+ if (!sampler)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ sampler->n_planes = 1;
+
+ uint32_t border_color_stride = GFX_VERx10 == 75 ? 512 : 64;
+ uint32_t border_color_offset;
+ ASSERTED bool has_custom_color = false;
+ if (pCreateInfo->borderColor <= VK_BORDER_COLOR_INT_OPAQUE_WHITE) {
+ border_color_offset = device->border_colors.offset +
+ pCreateInfo->borderColor *
+ border_color_stride;
+ } else {
+ assert(GFX_VER >= 8);
+ sampler->custom_border_color =
+ anv_state_reserved_pool_alloc(&device->custom_border_colors);
+ border_color_offset = sampler->custom_border_color.offset;
+ }
+
+#if GFX_VER >= 9
+ unsigned sampler_reduction_mode = STD_FILTER;
+ bool enable_sampler_reduction = false;
+#endif
+
+ vk_foreach_struct_const(ext, pCreateInfo->pNext) {
+ switch (ext->sType) {
+ case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO: {
+ VkSamplerYcbcrConversionInfo *pSamplerConversion =
+ (VkSamplerYcbcrConversionInfo *) ext;
+ ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion,
+ pSamplerConversion->conversion);
+
+ /* Ignore conversion for non-YUV formats. This fulfills a requirement
+ * for clients that want to utilize the same code path for images with
+ * external formats (VK_FORMAT_UNDEFINED) and "regular" RGBA images
+ * where format is known.
+ */ + if (conversion == NULL || !conversion->format->can_ycbcr) + break; + + sampler->n_planes = conversion->format->n_planes; + sampler->conversion = conversion; + break; + } +#if GFX_VER >= 9 + case VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO: { + VkSamplerReductionModeCreateInfo *sampler_reduction = + (VkSamplerReductionModeCreateInfo *) ext; + sampler_reduction_mode = + vk_to_intel_sampler_reduction_mode[sampler_reduction->reductionMode]; + enable_sampler_reduction = true; + break; + } +#endif + case VK_STRUCTURE_TYPE_SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT: { + VkSamplerCustomBorderColorCreateInfoEXT *custom_border_color = + (VkSamplerCustomBorderColorCreateInfoEXT *) ext; + if (sampler->custom_border_color.map == NULL) + break; + + union isl_color_value color = { .u32 = { + custom_border_color->customBorderColor.uint32[0], + custom_border_color->customBorderColor.uint32[1], + custom_border_color->customBorderColor.uint32[2], + custom_border_color->customBorderColor.uint32[3], + } }; + + const struct anv_format *format_desc = + custom_border_color->format != VK_FORMAT_UNDEFINED ? + anv_get_format(custom_border_color->format) : NULL; + + /* For formats with a swizzle, it does not carry over to the sampler + * for border colors, so we need to do the swizzle ourselves here. + */ + if (format_desc && format_desc->n_planes == 1 && + !isl_swizzle_is_identity(format_desc->planes[0].swizzle)) { + const struct anv_format_plane *fmt_plane = &format_desc->planes[0]; + + assert(!isl_format_has_int_channel(fmt_plane->isl_format)); + color = isl_color_value_swizzle(color, fmt_plane->swizzle, true); + } + + memcpy(sampler->custom_border_color.map, color.u32, sizeof(color)); + has_custom_color = true; + break; + } + case VK_STRUCTURE_TYPE_SAMPLER_BORDER_COLOR_COMPONENT_MAPPING_CREATE_INFO_EXT: + break; + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } + + assert((sampler->custom_border_color.map == NULL) || has_custom_color); + + if (device->physical->has_bindless_samplers) { + /* If we have bindless, allocate enough samplers. We allocate 32 bytes + * for each sampler instead of 16 bytes because we want all bindless + * samplers to be 32-byte aligned so we don't have to use indirect + * sampler messages on them. + */ + sampler->bindless_state = + anv_state_pool_alloc(&device->dynamic_state_pool, + sampler->n_planes * 32, 32); + } + + const bool seamless_cube = + !(pCreateInfo->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT); + + for (unsigned p = 0; p < sampler->n_planes; p++) { + const bool plane_has_chroma = + sampler->conversion && sampler->conversion->format->planes[p].has_chroma; + const VkFilter min_filter = + plane_has_chroma ? sampler->conversion->chroma_filter : pCreateInfo->minFilter; + const VkFilter mag_filter = + plane_has_chroma ? sampler->conversion->chroma_filter : pCreateInfo->magFilter; + const bool enable_min_filter_addr_rounding = min_filter != VK_FILTER_NEAREST; + const bool enable_mag_filter_addr_rounding = mag_filter != VK_FILTER_NEAREST; + /* From Broadwell PRM, SAMPLER_STATE: + * "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces." + */ + const bool isl_format_is_planar_yuv = sampler->conversion && + isl_format_is_yuv(sampler->conversion->format->planes[0].isl_format) && + isl_format_is_planar(sampler->conversion->format->planes[0].isl_format); + + const uint32_t mip_filter_mode = + isl_format_is_planar_yuv ? 
+ MIPFILTER_NONE : vk_to_intel_mipmap_mode[pCreateInfo->mipmapMode]; + + struct GENX(SAMPLER_STATE) sampler_state = { + .SamplerDisable = false, + .TextureBorderColorMode = DX10OGL, + +#if GFX_VER >= 11 + .CPSLODCompensationEnable = true, +#endif + +#if GFX_VER >= 8 + .LODPreClampMode = CLAMP_MODE_OGL, +#else + .LODPreClampEnable = CLAMP_ENABLE_OGL, +#endif + +#if GFX_VER == 8 + .BaseMipLevel = 0.0, +#endif + .MipModeFilter = mip_filter_mode, + .MagModeFilter = vk_to_intel_tex_filter(mag_filter, pCreateInfo->anisotropyEnable), + .MinModeFilter = vk_to_intel_tex_filter(min_filter, pCreateInfo->anisotropyEnable), + .TextureLODBias = anv_clamp_f(pCreateInfo->mipLodBias, -16, 15.996), + .AnisotropicAlgorithm = + pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY, + .MinLOD = anv_clamp_f(pCreateInfo->minLod, 0, 14), + .MaxLOD = anv_clamp_f(pCreateInfo->maxLod, 0, 14), + .ChromaKeyEnable = 0, + .ChromaKeyIndex = 0, + .ChromaKeyMode = 0, + .ShadowFunction = + vk_to_intel_shadow_compare_op[pCreateInfo->compareEnable ? + pCreateInfo->compareOp : VK_COMPARE_OP_NEVER], + .CubeSurfaceControlMode = seamless_cube ? OVERRIDE : PROGRAMMED, + + .BorderColorPointer = border_color_offset, + +#if GFX_VER >= 8 + .LODClampMagnificationMode = MIPNONE, +#endif + + .MaximumAnisotropy = vk_to_intel_max_anisotropy(pCreateInfo->maxAnisotropy), + .RAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding, + .RAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding, + .VAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding, + .VAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding, + .UAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding, + .UAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding, + .TrilinearFilterQuality = 0, + .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates, + .TCXAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeU], + .TCYAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeV], + .TCZAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeW], + +#if GFX_VER >= 9 + .ReductionType = sampler_reduction_mode, + .ReductionTypeEnable = enable_sampler_reduction, +#endif + }; + + GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state); + + if (sampler->bindless_state.map) { + memcpy(sampler->bindless_state.map + p * 32, + sampler->state[p], GENX(SAMPLER_STATE_length) * 4); + } + } + + *pSampler = anv_sampler_to_handle(sampler); + + return VK_SUCCESS; +} diff --git a/src/intel/vulkan_hasvk/gfx7_cmd_buffer.c b/src/intel/vulkan_hasvk/gfx7_cmd_buffer.c new file mode 100644 index 00000000000..55221799f32 --- /dev/null +++ b/src/intel/vulkan_hasvk/gfx7_cmd_buffer.c @@ -0,0 +1,314 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <assert.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> + +#include "anv_private.h" +#include "vk_format.h" + +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" + +static uint32_t +get_depth_format(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + switch (gfx->depth_att.vk_format) { + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_D16_UNORM_S8_UINT: + return D16_UNORM; + + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D24_UNORM_S8_UINT: + return D24_UNORM_X8_UINT; + + case VK_FORMAT_D32_SFLOAT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return D32_FLOAT; + + default: + return D16_UNORM; + } +} + +void +genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_RENDER_TARGETS)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) { + /* Take dynamic primitive topology in to account with + * 3DSTATE_SF::MultisampleRasterizationMode + */ + VkPolygonMode dynamic_raster_mode = + genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline, + dyn->ia.primitive_topology); + uint32_t ms_rast_mode = + genX(ms_rasterization_mode)(pipeline, dynamic_raster_mode); + + bool aa_enable = anv_rasterization_aa_mode(dynamic_raster_mode, + pipeline->line_mode); + + uint32_t sf_dw[GENX(3DSTATE_SF_length)]; + struct GENX(3DSTATE_SF) sf = { + GENX(3DSTATE_SF_header), + .DepthBufferSurfaceFormat = get_depth_format(cmd_buffer), + .LineWidth = dyn->rs.line.width, + .AntialiasingEnable = aa_enable, + .CullMode = genX(vk_to_intel_cullmode)[dyn->rs.cull_mode], + .FrontWinding = genX(vk_to_intel_front_face)[dyn->rs.front_face], + .MultisampleRasterizationMode = ms_rast_mode, + .GlobalDepthOffsetEnableSolid = dyn->rs.depth_bias.enable, + .GlobalDepthOffsetEnableWireframe = dyn->rs.depth_bias.enable, + .GlobalDepthOffsetEnablePoint = dyn->rs.depth_bias.enable, + .GlobalDepthOffsetConstant = dyn->rs.depth_bias.constant, + .GlobalDepthOffsetScale = dyn->rs.depth_bias.slope, + .GlobalDepthOffsetClamp = dyn->rs.depth_bias.clamp, + }; + GENX(3DSTATE_SF_pack)(NULL, sf_dw, &sf); + + anv_batch_emit_merge(&cmd_buffer->batch, sf_dw, pipeline->gfx7.sf); + } + + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) { + struct anv_state cc_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + GENX(COLOR_CALC_STATE_length) * 4, + 64); + struct GENX(COLOR_CALC_STATE) cc = { + .BlendConstantColorRed = dyn->cb.blend_constants[0],
+ .BlendConstantColorGreen = dyn->cb.blend_constants[1], + .BlendConstantColorBlue = dyn->cb.blend_constants[2], + .BlendConstantColorAlpha = dyn->cb.blend_constants[3], + .StencilReferenceValue = dyn->ds.stencil.front.reference & 0xff, + .BackfaceStencilReferenceValue = dyn->ds.stencil.back.reference & 0xff, + }; + GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) { + ccp.ColorCalcStatePointer = cc_state.offset; + } + } + + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) { + ls.LineStipplePattern = dyn->rs.line.stipple.pattern; + ls.LineStippleInverseRepeatCount = + 1.0f / MAX2(1, dyn->rs.line.stipple.factor); + ls.LineStippleRepeatCount = dyn->rs.line.stipple.factor; + } + } + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_RENDER_TARGETS)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK)) { + uint32_t depth_stencil_dw[GENX(DEPTH_STENCIL_STATE_length)]; + + VkImageAspectFlags ds_aspects = 0; + if (cmd_buffer->state.gfx.depth_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if (cmd_buffer->state.gfx.stencil_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + + struct vk_depth_stencil_state opt_ds = dyn->ds; + vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true); + + struct GENX(DEPTH_STENCIL_STATE) depth_stencil = { + .DoubleSidedStencilEnable = true, + + .StencilTestMask = opt_ds.stencil.front.compare_mask & 0xff, + .StencilWriteMask = opt_ds.stencil.front.write_mask & 0xff, + + .BackfaceStencilTestMask = opt_ds.stencil.back.compare_mask & 0xff, + .BackfaceStencilWriteMask = opt_ds.stencil.back.write_mask & 0xff, + + .DepthTestEnable = opt_ds.depth.test_enable, + .DepthBufferWriteEnable = opt_ds.depth.write_enable, + .DepthTestFunction = genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op], + .StencilTestEnable = opt_ds.stencil.test_enable, + .StencilBufferWriteEnable = opt_ds.stencil.write_enable, + .StencilFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail], + .StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass], + .StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail], + .StencilTestFunction = genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare], + .BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail], + .BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass], + .BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail], + .BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare], + }; + GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil); + + struct anv_state ds_state = + anv_cmd_buffer_emit_dynamic(cmd_buffer, depth_stencil_dw, + sizeof(depth_stencil_dw), 64); + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), dsp) { + 
dsp.PointertoDEPTH_STENCIL_STATE = ds_state.offset; + } + } + + if (cmd_buffer->state.gfx.index_buffer && + ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_INDEX_BUFFER)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))) { + struct anv_buffer *buffer = cmd_buffer->state.gfx.index_buffer; + uint32_t offset = cmd_buffer->state.gfx.index_offset; + +#if GFX_VERx10 == 75 + anv_batch_emit(&cmd_buffer->batch, GFX75_3DSTATE_VF, vf) { + vf.IndexedDrawCutIndexEnable = dyn->ia.primitive_restart_enable; + vf.CutIndex = cmd_buffer->state.gfx.restart_index; + } +#endif + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) { +#if GFX_VERx10 != 75 + ib.CutIndexEnable = dyn->ia.primitive_restart_enable; +#endif + ib.IndexFormat = cmd_buffer->state.gfx.index_type; + ib.MOCS = anv_mocs(cmd_buffer->device, + buffer->address.bo, + ISL_SURF_USAGE_INDEX_BUFFER_BIT); + + ib.BufferStartingAddress = anv_address_add(buffer->address, offset); + ib.BufferEndingAddress = anv_address_add(buffer->address, + buffer->vk.size); + } + } + + /* 3DSTATE_WM in the hope we can avoid spawning fragment shaders + * threads or if we have dirty dynamic primitive topology state and + * need to toggle 3DSTATE_WM::MultisampleRasterizationMode dynamically. + */ + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { + VkPolygonMode dynamic_raster_mode = + genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline, + dyn->ia.primitive_topology); + + uint32_t dwords[GENX(3DSTATE_WM_length)]; + struct GENX(3DSTATE_WM) wm = { + GENX(3DSTATE_WM_header), + + .ThreadDispatchEnable = anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) && + (pipeline->force_fragment_thread_dispatch || + !anv_cmd_buffer_all_color_write_masked(cmd_buffer)), + .MultisampleRasterizationMode = + genX(ms_rasterization_mode)(pipeline, + dynamic_raster_mode), + }; + GENX(3DSTATE_WM_pack)(NULL, dwords, &wm); + + anv_batch_emit_merge(&cmd_buffer->batch, dwords, pipeline->gfx7.wm); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS)) { + const uint32_t samples = MAX2(1, cmd_buffer->state.gfx.samples); + const struct vk_sample_locations_state *sl = dyn->ms.sample_locations; + genX(emit_multisample)(&cmd_buffer->batch, samples, + sl->per_pixel == samples ? sl : NULL); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { + const uint8_t color_writes = dyn->cb.color_write_enables; + + /* Blend states of each RT */ + uint32_t blend_dws[GENX(BLEND_STATE_length) + + MAX_RTS * GENX(BLEND_STATE_ENTRY_length)]; + uint32_t *dws = blend_dws; + memset(blend_dws, 0, sizeof(blend_dws)); + + /* Skip this part */ + dws += GENX(BLEND_STATE_length); + + for (uint32_t i = 0; i < MAX_RTS; i++) { + /* Disable anything above the current number of color attachments. 
*/ + bool write_disabled = i >= cmd_buffer->state.gfx.color_att_count || + (color_writes & BITFIELD_BIT(i)) == 0; + struct GENX(BLEND_STATE_ENTRY) entry = { + .WriteDisableAlpha = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_A_BIT) == 0, + .WriteDisableRed = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_R_BIT) == 0, + .WriteDisableGreen = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_G_BIT) == 0, + .WriteDisableBlue = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_B_BIT) == 0, + .LogicOpFunction = genX(vk_to_intel_logic_op)[dyn->cb.logic_op], + }; + GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry); + dws += GENX(BLEND_STATE_ENTRY_length); + } + + uint32_t num_dwords = GENX(BLEND_STATE_length) + + GENX(BLEND_STATE_ENTRY_length) * MAX_RTS; + + struct anv_state blend_states = + anv_cmd_buffer_merge_dynamic(cmd_buffer, blend_dws, + pipeline->gfx7.blend_state, num_dwords, 64); + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) { + bsp.BlendStatePointer = blend_states.offset; + } + } + + /* When we're done, there is no more dirty gfx state. */ + vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state); + cmd_buffer->state.gfx.dirty = 0; +} + +void +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, + bool enable) +{ + /* The NP PMA fix doesn't exist on gfx7 */ +} diff --git a/src/intel/vulkan_hasvk/gfx8_cmd_buffer.c b/src/intel/vulkan_hasvk/gfx8_cmd_buffer.c new file mode 100644 index 00000000000..8972a0c73fd --- /dev/null +++ b/src/intel/vulkan_hasvk/gfx8_cmd_buffer.c @@ -0,0 +1,706 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <assert.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> + +#include "anv_private.h" + +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" + +void +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable) +{ + if (cmd_buffer->state.pma_fix_enabled == enable) + return; + + cmd_buffer->state.pma_fix_enabled = enable; + + /* According to the Broadwell PIPE_CONTROL documentation, software should + * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set + * prior to the LRI. If stencil buffer writes are enabled, then a Render + * Cache Flush is also necessary.
+ * + * The Skylake docs say to use a depth stall rather than a command + * streamer stall. However, the hardware seems to violently disagree. + * A full command streamer stall seems to be needed in both cases. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DepthCacheFlushEnable = true; + pc.CommandStreamerStallEnable = true; + pc.RenderTargetCacheFlushEnable = true; +#if GFX_VER >= 12 + pc.TileCacheFlushEnable = true; + + /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must + * be set with any PIPE_CONTROL with Depth Flush Enable bit set. + */ + pc.DepthStallEnable = true; +#endif + } + +#if GFX_VER == 9 + + uint32_t cache_mode; + anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0), + .STCPMAOptimizationEnable = enable, + .STCPMAOptimizationEnableMask = true); + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(CACHE_MODE_0_num); + lri.DataDWord = cache_mode; + } + +#elif GFX_VER == 8 + + uint32_t cache_mode; + anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1), + .NPPMAFixEnable = enable, + .NPEarlyZFailsDisable = enable, + .NPPMAFixEnableMask = true, + .NPEarlyZFailsDisableMask = true); + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(CACHE_MODE_1_num); + lri.DataDWord = cache_mode; + } + +#endif /* GFX_VER == 8 */ + + /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache + * Flush bits is often necessary. We do it regardless because it's easier. + * The render cache flush is also necessary if stencil writes are enabled. + * + * Again, the Skylake docs give a different set of flushes but the BDW + * flushes seem to work just as well. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DepthStallEnable = true; + pc.DepthCacheFlushEnable = true; + pc.RenderTargetCacheFlushEnable = true; +#if GFX_VER >= 12 + pc.TileCacheFlushEnable = true; +#endif + } +} + +UNUSED static bool +want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer, + const struct vk_depth_stencil_state *ds) +{ + assert(GFX_VER == 8); + + /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE: + * + * SW must set this bit in order to enable this fix when following + * expression is TRUE. 
+ * + * 3DSTATE_WM::ForceThreadDispatch != 1 && + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) && + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) && + * (3DSTATE_DEPTH_BUFFER::HIZ Enable) && + * !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) && + * (3DSTATE_PS_EXTRA::PixelShaderValid) && + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) && + * (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) && + * (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) && + * 3DSTATE_WM::ForceKillPix != ForceOff && + * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable && + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) || + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE && + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) || + * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF)) + */ + + /* These are always true: + * 3DSTATE_WM::ForceThreadDispatch != 1 && + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) + */ + + /* We only enable the PMA fix if we know for certain that HiZ is enabled. + * If we don't know whether HiZ is enabled or not, we disable the PMA fix + * and there is no harm. + * + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) && + * 3DSTATE_DEPTH_BUFFER::HIZ Enable + */ + if (!cmd_buffer->state.hiz_enabled) + return false; + + /* 3DSTATE_PS_EXTRA::PixelShaderValid */ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) + return false; + + /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + if (wm_prog_data->early_fragment_tests) + return false; + + /* We never use anv_pipeline for HiZ ops so this is trivially true: + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) + */ + + /* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */ + if (!ds->depth.test_enable) + return false; + + /* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) && + * 3DSTATE_WM::ForceKillPix != ForceOff && + * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable && + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) || + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE && + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) || + * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF)) + */ + return (pipeline->kill_pixel && (ds->depth.write_enable || + ds->stencil.write_enable)) || + wm_prog_data->computed_depth_mode != PSCDEPTH_OFF; +} + +UNUSED static bool +want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer, + const struct vk_depth_stencil_state *ds) +{ + if (GFX_VER > 9) + return false; + assert(GFX_VER == 9); + + /* From the Skylake PRM Vol. 
2c CACHE_MODE_1::STC PMA Optimization Enable: + * + * Clearing this bit will force the STC cache to wait for pending + * retirement of pixels at the HZ-read stage and do the STC-test for + * Non-promoted, R-computed and Computed depth modes instead of + * postponing the STC-test to RCPFE. + * + * STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE && + * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable + * + * STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE && + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE) + * + * COMP_STC_EN = STC_TEST_EN && + * 3DSTATE_PS_EXTRA::PixelShaderComputesStencil + * + * SW parses the pipeline states to generate the following logical + * signal indicating if PMA FIX can be enabled. + * + * STC_PMA_OPT = + * 3DSTATE_WM::ForceThreadDispatch != 1 && + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) && + * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL && + * 3DSTATE_DEPTH_BUFFER::HIZ Enable && + * !(3DSTATE_WM::EDSC_Mode == 2) && + * 3DSTATE_PS_EXTRA::PixelShaderValid && + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) && + * (COMP_STC_EN || STC_WRITE_EN) && + * ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_WM::ForceKillPix == ON || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) || + * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)) + */ + + /* These are always true: + * 3DSTATE_WM::ForceThreadDispatch != 1 && + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) + */ + + /* We only enable the PMA fix if we know for certain that HiZ is enabled. + * If we don't know whether HiZ is enabled or not, we disable the PMA fix + * and there is no harm. 
+ * + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) && + * 3DSTATE_DEPTH_BUFFER::HIZ Enable + */ + if (!cmd_buffer->state.hiz_enabled) + return false; + + /* We can't possibly know if HiZ is enabled without the depth attachment */ + ASSERTED const struct anv_image_view *d_iview = + cmd_buffer->state.gfx.depth_att.iview; + assert(d_iview && d_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ); + + /* 3DSTATE_PS_EXTRA::PixelShaderValid */ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) + return false; + + /* !(3DSTATE_WM::EDSC_Mode == 2) */ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + if (wm_prog_data->early_fragment_tests) + return false; + + /* We never use anv_pipeline for HiZ ops so this is trivially true: + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) + */ + + /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE && + * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable + */ + const bool stc_test_en = ds->stencil.test_enable; + + /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE && + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE) + */ + const bool stc_write_en = ds->stencil.write_enable; + + /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */ + const bool comp_stc_en = stc_test_en && wm_prog_data->computed_stencil; + + /* COMP_STC_EN || STC_WRITE_EN */ + if (!(comp_stc_en || stc_write_en)) + return false; + + /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_WM::ForceKillPix == ON || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) || + * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF) + */ + return pipeline->kill_pixel || + wm_prog_data->computed_depth_mode != PSCDEPTH_OFF; +} + +void +genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + +#if GFX_VER >= 11 + if (cmd_buffer->device->vk.enabled_extensions.KHR_fragment_shading_rate && + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR)) + genX(emit_shading_rate)(&cmd_buffer->batch, pipeline, &dyn->fsr); +#endif /* GFX_VER >= 11 */ + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) { + uint32_t sf_dw[GENX(3DSTATE_SF_length)]; + struct GENX(3DSTATE_SF) sf = { + GENX(3DSTATE_SF_header), + }; +#if GFX_VER == 8 + if (cmd_buffer->device->info->platform == INTEL_PLATFORM_CHV) { + sf.CHVLineWidth = dyn->rs.line.width; + } else { + sf.LineWidth = dyn->rs.line.width; + } +#else + sf.LineWidth = dyn->rs.line.width, +#endif + GENX(3DSTATE_SF_pack)(NULL, sf_dw, &sf); + anv_batch_emit_merge(&cmd_buffer->batch, sf_dw, pipeline->gfx8.sf); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) || + BITSET_TEST(dyn->dirty, 
MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) { + /* Take dynamic primitive topology in to account with + * 3DSTATE_RASTER::APIMode + * 3DSTATE_RASTER::DXMultisampleRasterizationEnable + * 3DSTATE_RASTER::AntialiasingEnable + */ + uint32_t api_mode = 0; + bool msaa_raster_enable = false; + + VkPolygonMode dynamic_raster_mode = + genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline, + dyn->ia.primitive_topology); + + genX(rasterization_mode)(dynamic_raster_mode, + pipeline->line_mode, dyn->rs.line.width, + &api_mode, &msaa_raster_enable); + + bool aa_enable = anv_rasterization_aa_mode(dynamic_raster_mode, + pipeline->line_mode); + + uint32_t raster_dw[GENX(3DSTATE_RASTER_length)]; + struct GENX(3DSTATE_RASTER) raster = { + GENX(3DSTATE_RASTER_header), + .APIMode = api_mode, + .DXMultisampleRasterizationEnable = msaa_raster_enable, + .AntialiasingEnable = aa_enable, + .CullMode = genX(vk_to_intel_cullmode)[dyn->rs.cull_mode], + .FrontWinding = genX(vk_to_intel_front_face)[dyn->rs.front_face], + .GlobalDepthOffsetEnableSolid = dyn->rs.depth_bias.enable, + .GlobalDepthOffsetEnableWireframe = dyn->rs.depth_bias.enable, + .GlobalDepthOffsetEnablePoint = dyn->rs.depth_bias.enable, + .GlobalDepthOffsetConstant = dyn->rs.depth_bias.constant, + .GlobalDepthOffsetScale = dyn->rs.depth_bias.slope, + .GlobalDepthOffsetClamp = dyn->rs.depth_bias.clamp, + }; + GENX(3DSTATE_RASTER_pack)(NULL, raster_dw, &raster); + anv_batch_emit_merge(&cmd_buffer->batch, raster_dw, + pipeline->gfx8.raster); + } + + /* Stencil reference values moved from COLOR_CALC_STATE in gfx8 to + * 3DSTATE_WM_DEPTH_STENCIL in gfx9. That means the dirty bits gets split + * across different state packets for gfx8 and gfx9. We handle that by + * using a big old #if switch here. + */ +#if GFX_VER == 8 + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) { + struct anv_state cc_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + GENX(COLOR_CALC_STATE_length) * 4, + 64); + struct GENX(COLOR_CALC_STATE) cc = { + .BlendConstantColorRed = dyn->cb.blend_constants[0], + .BlendConstantColorGreen = dyn->cb.blend_constants[1], + .BlendConstantColorBlue = dyn->cb.blend_constants[2], + .BlendConstantColorAlpha = dyn->cb.blend_constants[3], + .StencilReferenceValue = dyn->ds.stencil.front.reference & 0xff, + .BackfaceStencilReferenceValue = dyn->ds.stencil.back.reference & 0xff, + }; + GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) { + ccp.ColorCalcStatePointer = cc_state.offset; + ccp.ColorCalcStatePointerValid = true; + } + } + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_RENDER_TARGETS)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK)) { + VkImageAspectFlags ds_aspects = 0; + if (cmd_buffer->state.gfx.depth_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if (cmd_buffer->state.gfx.stencil_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + + struct vk_depth_stencil_state 
opt_ds = dyn->ds; + vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) { + ds.DoubleSidedStencilEnable = true; + + ds.StencilTestMask = opt_ds.stencil.front.compare_mask & 0xff; + ds.StencilWriteMask = opt_ds.stencil.front.write_mask & 0xff; + + ds.BackfaceStencilTestMask = opt_ds.stencil.back.compare_mask & 0xff; + ds.BackfaceStencilWriteMask = opt_ds.stencil.back.write_mask & 0xff; + + ds.DepthTestEnable = opt_ds.depth.test_enable; + ds.DepthBufferWriteEnable = opt_ds.depth.write_enable; + ds.DepthTestFunction = genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op]; + ds.StencilTestEnable = opt_ds.stencil.test_enable; + ds.StencilBufferWriteEnable = opt_ds.stencil.write_enable; + ds.StencilFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail]; + ds.StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass]; + ds.StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail]; + ds.StencilTestFunction = genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare]; + ds.BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail]; + ds.BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass]; + ds.BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail]; + ds.BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare]; + } + + const bool pma = want_depth_pma_fix(cmd_buffer, &opt_ds); + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, pma); + } +#else + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) { + struct anv_state cc_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + GENX(COLOR_CALC_STATE_length) * 4, + 64); + struct GENX(COLOR_CALC_STATE) cc = { + .BlendConstantColorRed = dyn->cb.blend_constants[0], + .BlendConstantColorGreen = dyn->cb.blend_constants[1], + .BlendConstantColorBlue = dyn->cb.blend_constants[2], + .BlendConstantColorAlpha = dyn->cb.blend_constants[3], + }; + GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) { + ccp.ColorCalcStatePointer = cc_state.offset; + ccp.ColorCalcStatePointerValid = true; + } + } + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_RENDER_TARGETS)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) { + VkImageAspectFlags ds_aspects = 0; + if (cmd_buffer->state.gfx.depth_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if (cmd_buffer->state.gfx.stencil_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + + struct vk_depth_stencil_state opt_ds = dyn->ds; + vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) { + ds.DoubleSidedStencilEnable = true; + + ds.StencilTestMask = opt_ds.stencil.front.compare_mask & 0xff; + 
ds.StencilWriteMask = opt_ds.stencil.front.write_mask & 0xff; + + ds.BackfaceStencilTestMask = opt_ds.stencil.back.compare_mask & 0xff; + ds.BackfaceStencilWriteMask = opt_ds.stencil.back.write_mask & 0xff; + + ds.StencilReferenceValue = opt_ds.stencil.front.reference & 0xff; + ds.BackfaceStencilReferenceValue = opt_ds.stencil.back.reference & 0xff; + + ds.DepthTestEnable = opt_ds.depth.test_enable; + ds.DepthBufferWriteEnable = opt_ds.depth.write_enable; + ds.DepthTestFunction = genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op]; + ds.StencilTestEnable = opt_ds.stencil.test_enable; + ds.StencilBufferWriteEnable = opt_ds.stencil.write_enable; + ds.StencilFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail]; + ds.StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass]; + ds.StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail]; + ds.StencilTestFunction = genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare]; + ds.BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail]; + ds.BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass]; + ds.BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail]; + ds.BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare]; + } + + const bool pma = want_stencil_pma_fix(cmd_buffer, &opt_ds); + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, pma); + } +#endif + +#if GFX_VER >= 12 + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) { + db.DepthBoundsTestEnable = dyn->ds.depth.bounds_test.enable; + db.DepthBoundsTestMinValue = dyn->ds.depth.bounds_test.min; + db.DepthBoundsTestMaxValue = dyn->ds.depth.bounds_test.max; + } + } +#endif + + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) { + ls.LineStipplePattern = dyn->rs.line.stipple.pattern; + ls.LineStippleInverseRepeatCount = + 1.0f / MAX2(1, dyn->rs.line.stipple.factor); + ls.LineStippleRepeatCount = dyn->rs.line.stipple.factor; + } + } + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_INDEX_BUFFER)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) { +#if GFX_VERx10 >= 125 + vf.GeometryDistributionEnable = true; +#endif + vf.IndexedDrawCutIndexEnable = dyn->ia.primitive_restart_enable; + vf.CutIndex = cmd_buffer->state.gfx.restart_index; + } + } + + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_INDEX_BUFFER) { + struct anv_buffer *buffer = cmd_buffer->state.gfx.index_buffer; + uint32_t offset = cmd_buffer->state.gfx.index_offset; + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) { + ib.IndexFormat = cmd_buffer->state.gfx.index_type; + ib.MOCS = anv_mocs(cmd_buffer->device, + buffer->address.bo, + ISL_SURF_USAGE_INDEX_BUFFER_BIT); +#if GFX_VER >= 12 + ib.L3BypassDisable = true; +#endif + ib.BufferStartingAddress = anv_address_add(buffer->address, offset); + ib.BufferSize = vk_buffer_range(&buffer->vk, offset, + VK_WHOLE_SIZE); + } + } + +#if GFX_VERx10 >= 125 + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) { + 
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VFG), vfg) { + /* If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE*/ + vfg.DistributionMode = + anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? RR_STRICT : + RR_FREE; + vfg.DistributionGranularity = BatchLevelGranularity; + /* Wa_14014890652 */ + if (intel_device_info_is_dg2(cmd_buffer->device->info)) + vfg.GranularityThresholdDisable = 1; + vfg.ListCutIndexEnable = dyn->ia.primitive_restart_enable; + /* 192 vertices for TRILIST_ADJ */ + vfg.ListNBatchSizeScale = 0; + /* Batch size of 384 vertices */ + vfg.List3BatchSizeScale = 2; + /* Batch size of 128 vertices */ + vfg.List2BatchSizeScale = 1; + /* Batch size of 128 vertices */ + vfg.List1BatchSizeScale = 2; + /* Batch size of 256 vertices for STRIP topologies */ + vfg.StripBatchSizeScale = 3; + /* 192 control points for PATCHLIST_3 */ + vfg.PatchBatchSizeScale = 1; + /* 192 control points for PATCHLIST_3 */ + vfg.PatchBatchSizeMultiplier = 31; + } + } +#endif + + if (pipeline->base.device->vk.enabled_extensions.EXT_sample_locations && + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS)) + genX(emit_sample_pattern)(&cmd_buffer->batch, dyn->ms.sample_locations); + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { + /* 3DSTATE_WM in the hope we can avoid spawning fragment shaders + * threads. + */ + uint32_t wm_dwords[GENX(3DSTATE_WM_length)]; + struct GENX(3DSTATE_WM) wm = { + GENX(3DSTATE_WM_header), + + .ForceThreadDispatchEnable = anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) && + (pipeline->force_fragment_thread_dispatch || + anv_cmd_buffer_all_color_write_masked(cmd_buffer)) ? + ForceON : 0, + }; + GENX(3DSTATE_WM_pack)(NULL, wm_dwords, &wm); + + anv_batch_emit_merge(&cmd_buffer->batch, wm_dwords, pipeline->gfx8.wm); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { + const uint8_t color_writes = dyn->cb.color_write_enables; + const struct anv_cmd_graphics_state *state = &cmd_buffer->state.gfx; + bool has_writeable_rt = + anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) && + (color_writes & ((1u << state->color_att_count) - 1)) != 0; + + /* 3DSTATE_PS_BLEND to be consistent with the rest of the + * BLEND_STATE_ENTRY. + */ + uint32_t ps_blend_dwords[GENX(3DSTATE_PS_BLEND_length)]; + struct GENX(3DSTATE_PS_BLEND) ps_blend = { + GENX(3DSTATE_PS_BLEND_header), + .HasWriteableRT = has_writeable_rt, + }; + GENX(3DSTATE_PS_BLEND_pack)(NULL, ps_blend_dwords, &ps_blend); + anv_batch_emit_merge(&cmd_buffer->batch, ps_blend_dwords, + pipeline->gfx8.ps_blend); + + uint32_t blend_dws[GENX(BLEND_STATE_length) + + MAX_RTS * GENX(BLEND_STATE_ENTRY_length)]; + uint32_t *dws = blend_dws; + memset(blend_dws, 0, sizeof(blend_dws)); + + /* Skip this part */ + dws += GENX(BLEND_STATE_length); + + for (uint32_t i = 0; i < MAX_RTS; i++) { + /* Disable anything above the current number of color attachments. 
*/ + bool write_disabled = i >= cmd_buffer->state.gfx.color_att_count || + (color_writes & BITFIELD_BIT(i)) == 0; + struct GENX(BLEND_STATE_ENTRY) entry = { + .WriteDisableAlpha = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_A_BIT) == 0, + .WriteDisableRed = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_R_BIT) == 0, + .WriteDisableGreen = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_G_BIT) == 0, + .WriteDisableBlue = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_B_BIT) == 0, + .LogicOpFunction = genX(vk_to_intel_logic_op)[dyn->cb.logic_op], + }; + GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry); + dws += GENX(BLEND_STATE_ENTRY_length); + } + + uint32_t num_dwords = GENX(BLEND_STATE_length) + + GENX(BLEND_STATE_ENTRY_length) * MAX_RTS; + + struct anv_state blend_states = + anv_cmd_buffer_merge_dynamic(cmd_buffer, blend_dws, + pipeline->gfx8.blend_state, num_dwords, 64); + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) { + bsp.BlendStatePointer = blend_states.offset; + bsp.BlendStatePointerValid = true; + } + } + + /* When we're done, there is no more dirty gfx state. */ + vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state); + cmd_buffer->state.gfx.dirty = 0; +} diff --git a/src/intel/vulkan_hasvk/meson.build b/src/intel/vulkan_hasvk/meson.build new file mode 100644 index 00000000000..69e4341e1d9 --- /dev/null +++ b/src/intel/vulkan_hasvk/meson.build @@ -0,0 +1,265 @@ +# Copyright © 2017-2019 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
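The per-RT loop that closes gfx8_cmd_buffer.c above combines the dynamic color-write-enable bitmask with the pipeline's per-attachment component write mask. Below is a minimal standalone sketch of just that combination; the struct, constants and function names are hypothetical stand-ins for illustration, not driver code (the real loop packs the resulting booleans into GENX(BLEND_STATE_ENTRY) as shown above).

   #include <stdbool.h>
   #include <stdint.h>
   #include <stdio.h>

   /* Hypothetical stand-ins for the Vulkan component bits and driver state. */
   #define COMP_R (1u << 0)
   #define COMP_G (1u << 1)
   #define COMP_B (1u << 2)
   #define COMP_A (1u << 3)

   struct rt_write_disable {
      bool r, g, b, a;
   };

   /* Mirrors the loop above: a render target is fully write-disabled when it
    * is beyond the current attachment count or its dynamic color-write-enable
    * bit is clear; otherwise each channel follows the pipeline's component
    * write mask for that attachment.
    */
   static struct rt_write_disable
   compute_write_disable(uint32_t rt, uint32_t color_att_count,
                         uint8_t color_write_enables,
                         const uint8_t *color_comp_writes)
   {
      bool disabled = rt >= color_att_count ||
                      (color_write_enables & (1u << rt)) == 0;
      return (struct rt_write_disable) {
         .r = disabled || (color_comp_writes[rt] & COMP_R) == 0,
         .g = disabled || (color_comp_writes[rt] & COMP_G) == 0,
         .b = disabled || (color_comp_writes[rt] & COMP_B) == 0,
         .a = disabled || (color_comp_writes[rt] & COMP_A) == 0,
      };
   }

   int main(void)
   {
      const uint8_t comp_writes[2] = { COMP_R | COMP_G | COMP_B | COMP_A,
                                       COMP_R | COMP_G };
      for (uint32_t rt = 0; rt < 2; rt++) {
         struct rt_write_disable wd =
            compute_write_disable(rt, 2, 0x3, comp_writes);
         printf("RT%u write-disable rgba: %d%d%d%d\n",
                rt, wd.r, wd.g, wd.b, wd.a);
      }
      return 0;
   }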
+ +anv_hasvk_entrypoints = custom_target( + 'anv_hasvk_entrypoints', + input : [vk_entrypoints_gen, vk_api_xml], + output : ['anv_entrypoints.h', 'anv_entrypoints.c'], + command : [ + prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak', + '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'anv', + '--device-prefix', 'gfx7', '--device-prefix', 'gfx75', + '--device-prefix', 'gfx8', '--device-prefix', 'gfx9', + '--device-prefix', 'gfx11', '--device-prefix', 'gfx12', + '--device-prefix', 'gfx125', + ], + depend_files : vk_entrypoints_gen_depend_files, +) + +intel_hasvk_icd = custom_target( + 'intel_hasvk_icd', + input : [vk_icd_gen, vk_api_xml], + output : 'intel_hasvk_icd.@0@.json'.format(host_machine.cpu()), + command : [ + prog_python, '@INPUT0@', + '--api-version', '1.3', '--xml', '@INPUT1@', + '--lib-path', join_paths(get_option('prefix'), get_option('libdir'), + 'libvulkan_intel_hasvk.so'), + '--out', '@OUTPUT@', + ], + build_by_default : true, + install_dir : with_vulkan_icd_dir, + install : true, +) + +if meson.version().version_compare('>= 0.58') + _dev_icdname = 'intel_hasvk_devenv_icd.@0@.json'.format(host_machine.cpu()) + custom_target( + 'intel_hasvk_devenv_icd', + input : [vk_icd_gen, vk_api_xml], + output : _dev_icdname, + command : [ + prog_python, '@INPUT0@', + '--api-version', '1.3', '--xml', '@INPUT1@', + '--lib-path', meson.current_build_dir() / 'libvulkan_intel_hasvk.so', + '--out', '@OUTPUT@', + ], + build_by_default : true, + ) + + devenv.append('VK_ICD_FILENAMES', meson.current_build_dir() / _dev_icdname) +endif + +libanv_per_hw_ver_libs = [] +anv_per_hw_ver_files = files( + 'genX_blorp_exec.c', + 'genX_cmd_buffer.c', + 'genX_gpu_memcpy.c', + 'genX_pipeline.c', + 'genX_query.c', + 'genX_state.c', +) +foreach g : [['70', ['gfx7_cmd_buffer.c']], ['75', ['gfx7_cmd_buffer.c']], + ['80', ['gfx8_cmd_buffer.c']], ['90', ['gfx8_cmd_buffer.c']], + ['110', ['gfx8_cmd_buffer.c']], ['120', ['gfx8_cmd_buffer.c']], + ['125', ['gfx8_cmd_buffer.c']]] + _gfx_ver = g[0] + libanv_per_hw_ver_libs += static_library( + 'anv_per_hw_ver@0@'.format(_gfx_ver), + [anv_per_hw_ver_files, g[1], anv_hasvk_entrypoints[0]], + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_compiler, inc_intel, + ], + c_args : [ + no_override_init_args, c_sse2_args, + '-DGFX_VERx10=@0@'.format(_gfx_ver), + ], + gnu_symbol_visibility : 'hidden', + dependencies : [ + dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml, + idep_vulkan_util_headers, idep_vulkan_wsi_headers, + idep_vulkan_runtime_headers, idep_intel_driver_ds_headers, + ], + ) +endforeach + +libanv_files = files( + 'anv_acceleration_structure.c', + 'anv_allocator.c', + 'anv_android.h', + 'anv_batch_chain.c', + 'anv_blorp.c', + 'anv_bo_sync.c', + 'anv_cmd_buffer.c', + 'anv_descriptor_set.c', + 'anv_device.c', + 'anv_formats.c', + 'anv_genX.h', + 'anv_image.c', + 'anv_measure.c', + 'anv_measure.h', + 'anv_nir.h', + 'anv_nir_add_base_work_group_id.c', + 'anv_nir_apply_pipeline_layout.c', + 'anv_nir_compute_push_layout.c', + 'anv_nir_lower_multiview.c', + 'anv_nir_lower_ubo_loads.c', + 'anv_nir_lower_ycbcr_textures.c', + 'anv_perf.c', + 'anv_pipeline.c', + 'anv_pipeline_cache.c', + 'anv_private.h', + 'anv_queue.c', + 'anv_util.c', + 'anv_utrace.c', + 'anv_wsi.c', +) + +anv_deps = [ + dep_libdrm, + dep_valgrind, + idep_genxml, + idep_nir_headers, + idep_vulkan_util_headers, + idep_vulkan_runtime_headers, + idep_vulkan_wsi_headers, +] +anv_flags = [ + no_override_init_args, + c_sse2_args, +] + 
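The foreach above compiles the shared genX_*.c sources once per hardware generation, each time with a different -DGFX_VERx10 value, and the resulting per-generation static libraries are linked into the driver alongside the gfx-prefixed entrypoints generated earlier. A small illustrative sketch of how such compile-time gating plays out in a single translation unit follows; this is not driver code, and GFX_VER is derived locally for the example (the gfx7/gfx8 command buffer code earlier in the patch gates on the same macros to pick the primitive-restart packet).

   #include <stdio.h>

   /* Normally supplied by meson: -DGFX_VERx10=70, 75, 80, 90, ... */
   #ifndef GFX_VERx10
   #define GFX_VERx10 80
   #endif
   #define GFX_VER (GFX_VERx10 / 10)

   /* The same source picks a different code path per generation at compile
    * time, so each per-gen library only contains the branch relevant to it.
    */
   static const char *cut_index_strategy(void)
   {
   #if GFX_VERx10 == 75
      return "Haswell-specific 3DSTATE_VF packet";
   #elif GFX_VER >= 8
      return "3DSTATE_VF packet (gfx8+)";
   #else
      return "3DSTATE_INDEX_BUFFER::CutIndexEnable";
   #endif
   }

   int main(void)
   {
      printf("GFX_VERx10=%d: primitive restart via %s\n",
             GFX_VERx10, cut_index_strategy());
      return 0;
   }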
+anv_cpp_flags = [] + +if with_platform_x11 + anv_deps += dep_xcb_dri3 +endif + +if with_platform_wayland + anv_deps += dep_wayland_client +endif + +if with_xlib_lease + anv_deps += [dep_xlib_xrandr] +endif + +if with_platform_android + libanv_files += files('anv_android.c') +else + libanv_files += files('anv_android_stubs.c') +endif + +anv_deps += idep_intel_driver_ds_headers + +libanv_hasvk_common = static_library( + 'anv_hasvk_common', + [ + libanv_files, anv_hasvk_entrypoints, sha1_h, + gen_xml_pack, + ], + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, + inc_util, + ], + c_args : anv_flags, + cpp_args : anv_cpp_flags, + gnu_symbol_visibility : 'hidden', + dependencies : anv_deps, +) + +libvulkan_intel_hasvk = shared_library( + 'vulkan_intel_hasvk', + [files('anv_gem.c'), anv_hasvk_entrypoints[0]], + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, + ], + link_whole : [libanv_hasvk_common, libanv_per_hw_ver_libs], + link_with : [ + libintel_compiler, libintel_dev, libisl, libblorp, libintel_perf, + ], + dependencies : [ + dep_thread, dep_dl, dep_m, anv_deps, idep_libintel_common, + idep_nir, idep_genxml, idep_vulkan_util, idep_vulkan_wsi, + idep_vulkan_runtime, idep_mesautil, idep_xmlconfig, + idep_intel_driver_ds, + ], + c_args : anv_flags, + gnu_symbol_visibility : 'hidden', + link_args : [ld_args_build_id, ld_args_bsymbolic, ld_args_gc_sections], + install : true, +) + +if with_symbols_check + test( + 'anv symbols check', + symbols_check, + args : [ + '--lib', libvulkan_intel_hasvk, + '--symbols-file', vulkan_icd_symbols, + symbols_check_args, + ], + suite : ['intel'], + ) +endif + +if with_tests + libvulkan_intel_hasvk_test = static_library( + 'vulkan_intel_hasvk_test', + [files('anv_gem_stubs.c'), anv_hasvk_entrypoints[0]], + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, + ], + link_whole : libanv_hasvk_common, + link_with : [ + libanv_per_hw_ver_libs, libintel_compiler, libintel_common, libintel_dev, + libisl, libblorp, libintel_perf, + ], + dependencies : [ + dep_thread, dep_dl, dep_m, anv_deps, + idep_nir, idep_vulkan_util, idep_vulkan_wsi, idep_vulkan_runtime, + idep_mesautil, + ], + c_args : anv_flags, + gnu_symbol_visibility : 'hidden', + ) + + foreach t : ['block_pool_no_free', 'block_pool_grow_first', + 'state_pool_no_free', 'state_pool_free_list_only', + 'state_pool', 'state_pool_padding'] + test( + 'anv_hasvk_@0@'.format(t), + executable( + t, + ['tests/@0@.c'.format(t), anv_hasvk_entrypoints[0]], + c_args : [ c_sse2_args ], + link_with : libvulkan_intel_hasvk_test, + dependencies : [ + dep_libdrm, dep_thread, dep_m, dep_valgrind, + idep_vulkan_util, idep_vulkan_wsi_headers, + idep_vulkan_runtime, idep_intel_driver_ds, + ], + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, + ], + ), + suite : ['intel'], + ) + endforeach +endif diff --git a/src/intel/vulkan_hasvk/tests/block_pool_grow_first.c b/src/intel/vulkan_hasvk/tests/block_pool_grow_first.c new file mode 100644 index 00000000000..109275b07cc --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/block_pool_grow_first.c @@ -0,0 +1,67 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including 
without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" +#include "test_common.h" + +int main(void) +{ + struct anv_physical_device physical_device = { + .use_softpin = true, + }; + struct anv_device device = {}; + struct anv_block_pool pool; + + /* Create a pool with initial size smaller than the block allocated, so + * that it must grow in the first allocation. + */ + const uint32_t block_size = 16 * 1024; + const uint32_t initial_size = block_size / 2; + + anv_device_set_physical(&device, &physical_device); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache, &device); + anv_block_pool_init(&pool, &device, "test", 4096, initial_size); + ASSERT(pool.size == initial_size); + + uint32_t padding; + int32_t offset = anv_block_pool_alloc(&pool, block_size, &padding); + + /* Pool will have grown at least space to fit the new allocation. */ + ASSERT(pool.size > initial_size); + ASSERT(pool.size >= initial_size + block_size); + + /* The whole initial size is considered padding and the allocation should be + * right next to it. + */ + ASSERT(padding == initial_size); + ASSERT(offset == initial_size); + + /* Use the memory to ensure it is valid. */ + void *map = anv_block_pool_map(&pool, offset, block_size); + memset(map, 22, block_size); + + anv_block_pool_finish(&pool); + anv_bo_cache_finish(&device.bo_cache); + pthread_mutex_destroy(&device.mutex); +} diff --git a/src/intel/vulkan_hasvk/tests/block_pool_no_free.c b/src/intel/vulkan_hasvk/tests/block_pool_no_free.c new file mode 100644 index 00000000000..e0e24dcc0c4 --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/block_pool_no_free.c @@ -0,0 +1,153 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <pthread.h> + +#include "anv_private.h" +#include "test_common.h" + +#define NUM_THREADS 16 +#define BLOCKS_PER_THREAD 1024 +#define NUM_RUNS 64 + +struct job { + pthread_t thread; + unsigned id; + struct anv_block_pool *pool; + int32_t blocks[BLOCKS_PER_THREAD]; + int32_t back_blocks[BLOCKS_PER_THREAD]; +} jobs[NUM_THREADS]; + + +static void *alloc_blocks(void *_job) +{ + struct job *job = _job; + uint32_t job_id = job - jobs; + uint32_t block_size = 16 * ((job_id % 4) + 1); + int32_t block, *data; + + for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) { + block = anv_block_pool_alloc(job->pool, block_size, NULL); + data = anv_block_pool_map(job->pool, block, block_size); + *data = block; + ASSERT(block >= 0); + job->blocks[i] = block; + + block = anv_block_pool_alloc_back(job->pool, block_size); + data = anv_block_pool_map(job->pool, block, block_size); + *data = block; + ASSERT(block < 0); + job->back_blocks[i] = -block; + } + + for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) { + block = job->blocks[i]; + data = anv_block_pool_map(job->pool, block, block_size); + ASSERT(*data == block); + + block = -job->back_blocks[i]; + data = anv_block_pool_map(job->pool, block, block_size); + ASSERT(*data == block); + } + + return NULL; +} + +static void validate_monotonic(int32_t **blocks) +{ + /* A list of indices, one per thread */ + unsigned next[NUM_THREADS]; + memset(next, 0, sizeof(next)); + + int highest = -1; + while (true) { + /* First, we find which thread has the lowest next element */ + int32_t thread_min = INT32_MAX; + int min_thread_idx = -1; + for (unsigned i = 0; i < NUM_THREADS; i++) { + if (next[i] >= BLOCKS_PER_THREAD) + continue; + + if (thread_min > blocks[i][next[i]]) { + thread_min = blocks[i][next[i]]; + min_thread_idx = i; + } + } + + /* The only way this can happen is if all of the next[] values are at + * BLOCKS_PER_THREAD, in which case, we're done.
+ */ + if (thread_min == INT32_MAX) + break; + + /* That next element had better be higher than the previous highest */ + ASSERT(blocks[min_thread_idx][next[min_thread_idx]] > highest); + + highest = blocks[min_thread_idx][next[min_thread_idx]]; + next[min_thread_idx]++; + } +} + +static void run_test() +{ + struct anv_physical_device physical_device = { + .use_relocations = true, + }; + struct anv_device device = {}; + struct anv_block_pool pool; + + anv_device_set_physical(&device, &physical_device); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache, &device); + anv_block_pool_init(&pool, &device, "test", 4096, 4096); + + for (unsigned i = 0; i < NUM_THREADS; i++) { + jobs[i].pool = &pool; + jobs[i].id = i; + pthread_create(&jobs[i].thread, NULL, alloc_blocks, &jobs[i]); + } + + for (unsigned i = 0; i < NUM_THREADS; i++) + pthread_join(jobs[i].thread, NULL); + + /* Validate that the block allocations were monotonic */ + int32_t *block_ptrs[NUM_THREADS]; + for (unsigned i = 0; i < NUM_THREADS; i++) + block_ptrs[i] = jobs[i].blocks; + validate_monotonic(block_ptrs); + + /* Validate that the back block allocations were monotonic */ + for (unsigned i = 0; i < NUM_THREADS; i++) + block_ptrs[i] = jobs[i].back_blocks; + validate_monotonic(block_ptrs); + + anv_block_pool_finish(&pool); + anv_bo_cache_finish(&device.bo_cache); + pthread_mutex_destroy(&device.mutex); +} + +int main(void) +{ + for (unsigned i = 0; i < NUM_RUNS; i++) + run_test(); +} diff --git a/src/intel/vulkan_hasvk/tests/state_pool.c b/src/intel/vulkan_hasvk/tests/state_pool.c new file mode 100644 index 00000000000..57cfa73d54e --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/state_pool.c @@ -0,0 +1,59 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <pthread.h> + +#include "anv_private.h" + +#include "test_common.h" + +#define NUM_THREADS 8 +#define STATES_PER_THREAD_LOG2 10 +#define STATES_PER_THREAD (1 << STATES_PER_THREAD_LOG2) +#define NUM_RUNS 64 + +#include "state_pool_test_helper.h" + +int main(void) +{ + struct anv_physical_device physical_device = { }; + struct anv_device device = {}; + struct anv_state_pool state_pool; + + anv_device_set_physical(&device, &physical_device); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache, &device); + + for (unsigned i = 0; i < NUM_RUNS; i++) { + anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 256); + + /* Grab one so a zero offset is impossible */ + anv_state_pool_alloc(&state_pool, 16, 16); + + run_state_pool_test(&state_pool); + + anv_state_pool_finish(&state_pool); + } + + anv_bo_cache_finish(&device.bo_cache); + pthread_mutex_destroy(&device.mutex); +} diff --git a/src/intel/vulkan_hasvk/tests/state_pool_free_list_only.c b/src/intel/vulkan_hasvk/tests/state_pool_free_list_only.c new file mode 100644 index 00000000000..602346fedae --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/state_pool_free_list_only.c @@ -0,0 +1,68 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <pthread.h> + +#include "anv_private.h" +#include "test_common.h" + +#define NUM_THREADS 8 +#define STATES_PER_THREAD_LOG2 12 +#define STATES_PER_THREAD (1 << STATES_PER_THREAD_LOG2) + +#include "state_pool_test_helper.h" + +int main(void) +{ + struct anv_physical_device physical_device = { }; + struct anv_device device = {}; + struct anv_state_pool state_pool; + + anv_device_set_physical(&device, &physical_device); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache, &device); + anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 4096); + + /* Grab one so a zero offset is impossible */ + anv_state_pool_alloc(&state_pool, 16, 16); + + /* Grab and return enough states that the state pool test below won't + * actually ever resize anything.
+ */ + { + struct anv_state states[NUM_THREADS * STATES_PER_THREAD]; + for (unsigned i = 0; i < NUM_THREADS * STATES_PER_THREAD; i++) { + states[i] = anv_state_pool_alloc(&state_pool, 16, 16); + ASSERT(states[i].offset != 0); + } + + for (unsigned i = 0; i < NUM_THREADS * STATES_PER_THREAD; i++) + anv_state_pool_free(&state_pool, states[i]); + } + + run_state_pool_test(&state_pool); + + anv_state_pool_finish(&state_pool); + anv_bo_cache_finish(&device.bo_cache); + pthread_mutex_destroy(&device.mutex); +} diff --git a/src/intel/vulkan_hasvk/tests/state_pool_no_free.c b/src/intel/vulkan_hasvk/tests/state_pool_no_free.c new file mode 100644 index 00000000000..fe076830406 --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/state_pool_no_free.c @@ -0,0 +1,119 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <pthread.h> + +#include "anv_private.h" +#include "test_common.h" + +#define NUM_THREADS 16 +#define STATES_PER_THREAD 1024 +#define NUM_RUNS 64 + +struct job { + pthread_t thread; + unsigned id; + struct anv_state_pool *pool; + uint32_t offsets[STATES_PER_THREAD]; +} jobs[NUM_THREADS]; + +pthread_barrier_t barrier; + +static void *alloc_states(void *_job) +{ + struct job *job = _job; + + pthread_barrier_wait(&barrier); + + for (unsigned i = 0; i < STATES_PER_THREAD; i++) { + struct anv_state state = anv_state_pool_alloc(job->pool, 16, 16); + job->offsets[i] = state.offset; + } + + return NULL; +} + +static void run_test() +{ + struct anv_physical_device physical_device = { }; + struct anv_device device = {}; + struct anv_state_pool state_pool; + + anv_device_set_physical(&device, &physical_device); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache, &device); + anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 64); + + pthread_barrier_init(&barrier, NULL, NUM_THREADS); + + for (unsigned i = 0; i < NUM_THREADS; i++) { + jobs[i].pool = &state_pool; + jobs[i].id = i; + pthread_create(&jobs[i].thread, NULL, alloc_states, &jobs[i]); + } + + for (unsigned i = 0; i < NUM_THREADS; i++) + pthread_join(jobs[i].thread, NULL); + + /* A list of indices, one per thread */ + unsigned next[NUM_THREADS]; + memset(next, 0, sizeof(next)); + + int highest = -1; + while (true) { + /* First, we find which thread has the highest next element */ + int thread_max = -1; + int max_thread_idx = -1; + for (unsigned i = 0; i < NUM_THREADS; i++) { + if (next[i] >= STATES_PER_THREAD) + continue; + + if (thread_max < jobs[i].offsets[next[i]]) { + thread_max = jobs[i].offsets[next[i]]; + max_thread_idx = i; + } + } + + /* The only way this can happen is if all of the next[] values are at + * STATES_PER_THREAD, in which case, we're done. + */ + if (thread_max == -1) + break; + + /* That next element had better be higher than the previous highest */ + ASSERT(jobs[max_thread_idx].offsets[next[max_thread_idx]] > highest); + + highest = jobs[max_thread_idx].offsets[next[max_thread_idx]]; + next[max_thread_idx]++; + } + + anv_state_pool_finish(&state_pool); + anv_bo_cache_finish(&device.bo_cache); + pthread_mutex_destroy(&device.mutex); +} + +int main(void) +{ + for (unsigned i = 0; i < NUM_RUNS; i++) + run_test(); +} diff --git a/src/intel/vulkan_hasvk/tests/state_pool_padding.c b/src/intel/vulkan_hasvk/tests/state_pool_padding.c new file mode 100644 index 00000000000..0ed72e1e502 --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/state_pool_padding.c @@ -0,0 +1,79 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" +#include "test_common.h" + +int main(void) +{ + struct anv_physical_device physical_device = { + .use_softpin = true, + }; + struct anv_device device = {}; + struct anv_state_pool state_pool; + + anv_device_set_physical(&device, &physical_device); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache, &device); + anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 4096); + + /* Get the size of the underlying block_pool */ + struct anv_block_pool *bp = &state_pool.block_pool; + uint64_t pool_size = bp->size; + + /* Grab one so the pool has some initial usage */ + anv_state_pool_alloc(&state_pool, 16, 16); + + /* Grab a state that is the size of the initial allocation */ + struct anv_state state = anv_state_pool_alloc(&state_pool, pool_size, 16); + + /* The pool must have grown */ + ASSERT(bp->size > pool_size); + + /* And the state must have been allocated at the end of the original size */ + ASSERT(state.offset == pool_size); + + /* A new allocation that fits into the returned empty space should have an + * offset within the original pool size + */ + state = anv_state_pool_alloc(&state_pool, 4096, 16); + ASSERT(state.offset + state.alloc_size <= pool_size); + + /* We should be able to allocate pool->block_size'd chunks in the returned area + */ + int left_chunks = pool_size / 4096 - 2; + for (int i = 0; i < left_chunks; i++) { + state = anv_state_pool_alloc(&state_pool, 4096, 16); + ASSERT(state.offset + state.alloc_size <= pool_size); + } + + /* Now the next chunk to be allocated should make the pool grow again */ + pool_size = bp->size; + state = anv_state_pool_alloc(&state_pool, 4096, 16); + ASSERT(bp->size > pool_size); + ASSERT(state.offset == pool_size); + + anv_state_pool_finish(&state_pool); + anv_bo_cache_finish(&device.bo_cache); + pthread_mutex_destroy(&device.mutex); +} diff --git a/src/intel/vulkan_hasvk/tests/state_pool_test_helper.h b/src/intel/vulkan_hasvk/tests/state_pool_test_helper.h new file mode 100644 index 00000000000..f22a28ecc6f --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/state_pool_test_helper.h @@ -0,0 +1,71 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <pthread.h> + +struct job { + struct anv_state_pool *pool; + unsigned id; + pthread_t thread; +} jobs[NUM_THREADS]; + +pthread_barrier_t barrier; + +static void *alloc_states(void *void_job) +{ + struct job *job = void_job; + + const unsigned chunk_size = 1 << (job->id % STATES_PER_THREAD_LOG2); + const unsigned num_chunks = STATES_PER_THREAD / chunk_size; + + struct anv_state states[chunk_size]; + + pthread_barrier_wait(&barrier); + + for (unsigned c = 0; c < num_chunks; c++) { + for (unsigned i = 0; i < chunk_size; i++) { + states[i] = anv_state_pool_alloc(job->pool, 16, 16); + memset(states[i].map, 139, 16); + ASSERT(states[i].offset != 0); + } + + for (unsigned i = 0; i < chunk_size; i++) + anv_state_pool_free(job->pool, states[i]); + } + + return NULL; +} + +static void run_state_pool_test(struct anv_state_pool *state_pool) +{ + pthread_barrier_init(&barrier, NULL, NUM_THREADS); + + for (unsigned i = 0; i < NUM_THREADS; i++) { + jobs[i].pool = state_pool; + jobs[i].id = i; + pthread_create(&jobs[i].thread, NULL, alloc_states, &jobs[i]); + } + + for (unsigned i = 0; i < NUM_THREADS; i++) + pthread_join(jobs[i].thread, NULL); +} diff --git a/src/intel/vulkan_hasvk/tests/test_common.h b/src/intel/vulkan_hasvk/tests/test_common.h new file mode 100644 index 00000000000..3f883e3bdcd --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/test_common.h @@ -0,0 +1,34 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> + +#define ASSERT(cond) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, "%s:%d: Test assertion `%s` failed.\n", \ + __FILE__, __LINE__, # cond); \ + abort(); \ + } \ + } while (false)
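
Two properties of the tests above are worth calling out: test_common.h's ASSERT aborts with file and line on failure, and the *_no_free tests verify that offsets handed out concurrently, when merged across threads in increasing order, form a strictly monotonic sequence. The standalone sketch below mirrors that merge-and-assert logic from validate_monotonic() over hard-coded arrays so it can be compiled with nothing but a C compiler; the thread count, the sample offsets, and the simplified validate_monotonic() signature are invented for illustration and are not part of the patch.

/* Standalone sketch (not part of the patch): replays the per-thread cursor
 * merge used by validate_monotonic() in block_pool_no_free.c on made-up data. */
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ASSERT(cond) \
   do { \
      if (!(cond)) { \
         fprintf(stderr, "%s:%d: Test assertion `%s` failed.\n", \
                 __FILE__, __LINE__, # cond); \
         abort(); \
      } \
   } while (false)

/* Example sizes only; the real test uses 16 threads and 1024 blocks each. */
#define NUM_THREADS 3
#define BLOCKS_PER_THREAD 4

static void validate_monotonic(const int32_t blocks[][BLOCKS_PER_THREAD])
{
   /* One cursor per thread into its (already increasing) offset list. */
   unsigned next[NUM_THREADS] = { 0 };
   int32_t highest = -1;

   while (true) {
      /* Pick the thread whose next unconsumed offset is smallest. */
      int32_t thread_min = INT32_MAX;
      int min_thread_idx = -1;
      for (unsigned i = 0; i < NUM_THREADS; i++) {
         if (next[i] >= BLOCKS_PER_THREAD)
            continue;
         if (blocks[i][next[i]] < thread_min) {
            thread_min = blocks[i][next[i]];
            min_thread_idx = i;
         }
      }

      /* Every cursor is exhausted: all offsets have been consumed. */
      if (min_thread_idx == -1)
         break;

      /* The merged sequence must be strictly increasing. */
      ASSERT(thread_min > highest);
      highest = thread_min;
      next[min_thread_idx]++;
   }
}

int main(void)
{
   /* Offsets as a pool might hand them out: increasing within each thread,
    * unique across threads. */
   const int32_t blocks[NUM_THREADS][BLOCKS_PER_THREAD] = {
      {  0, 48,  96, 160 },
      { 16, 64, 112, 176 },
      { 32, 80, 128, 192 },
   };

   validate_monotonic(blocks);
   printf("monotonic ordering holds\n");
   return 0;
}

Keeping the cursor-per-thread merge shaped like the in-tree version makes it straightforward to compare the sketch against what the patch actually tests.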