From 50013ca9a57c42114044f593c981bbad8c405cc9 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Wed, 3 Aug 2022 11:43:36 +0300 Subject: [PATCH] intel: add a hasvk vulkan driver This new driver is a copy of the current Anv code, it will only load on gfx7/8 platforms though. Signed-off-by: Lionel Landwerlin Acked-by: Jason Ekstrand Acked-by: Jason Ekstrand Acked-by: Jason Ekstrand Part-of: --- meson.build | 7 +- meson_options.txt | 2 +- src/intel/meson.build | 3 + src/intel/vulkan_hasvk/TODO | 13 + .../vulkan_hasvk/anv_acceleration_structure.c | 251 + src/intel/vulkan_hasvk/anv_allocator.c | 2176 +++++ src/intel/vulkan_hasvk/anv_android.c | 792 ++ src/intel/vulkan_hasvk/anv_android.h | 57 + src/intel/vulkan_hasvk/anv_android_stubs.c | 63 + src/intel/vulkan_hasvk/anv_batch_chain.c | 2477 ++++++ src/intel/vulkan_hasvk/anv_blorp.c | 1995 +++++ src/intel/vulkan_hasvk/anv_bo_sync.c | 237 + src/intel/vulkan_hasvk/anv_cmd_buffer.c | 1112 +++ src/intel/vulkan_hasvk/anv_descriptor_set.c | 2046 +++++ src/intel/vulkan_hasvk/anv_device.c | 4834 +++++++++++ src/intel/vulkan_hasvk/anv_formats.c | 1745 ++++ src/intel/vulkan_hasvk/anv_gem.c | 405 + src/intel/vulkan_hasvk/anv_gem_stubs.c | 187 + src/intel/vulkan_hasvk/anv_genX.h | 180 + src/intel/vulkan_hasvk/anv_image.c | 2973 +++++++ src/intel/vulkan_hasvk/anv_measure.c | 516 ++ src/intel/vulkan_hasvk/anv_measure.h | 82 + src/intel/vulkan_hasvk/anv_nir.h | 97 + .../anv_nir_add_base_work_group_id.c | 63 + .../anv_nir_apply_pipeline_layout.c | 1686 ++++ .../anv_nir_compute_push_layout.c | 290 + .../vulkan_hasvk/anv_nir_lower_multiview.c | 324 + .../vulkan_hasvk/anv_nir_lower_ubo_loads.c | 124 + .../anv_nir_lower_ycbcr_textures.c | 349 + src/intel/vulkan_hasvk/anv_perf.c | 488 ++ src/intel/vulkan_hasvk/anv_pipeline.c | 3300 ++++++++ src/intel/vulkan_hasvk/anv_pipeline_cache.c | 380 + src/intel/vulkan_hasvk/anv_private.h | 4303 ++++++++++ src/intel/vulkan_hasvk/anv_queue.c | 75 + src/intel/vulkan_hasvk/anv_util.c | 92 + src/intel/vulkan_hasvk/anv_utrace.c | 346 + src/intel/vulkan_hasvk/anv_wsi.c | 118 + src/intel/vulkan_hasvk/genX_blorp_exec.c | 410 + src/intel/vulkan_hasvk/genX_cmd_buffer.c | 7488 +++++++++++++++++ src/intel/vulkan_hasvk/genX_gpu_memcpy.c | 324 + src/intel/vulkan_hasvk/genX_pipeline.c | 2563 ++++++ src/intel/vulkan_hasvk/genX_query.c | 1530 ++++ src/intel/vulkan_hasvk/genX_state.c | 1141 +++ src/intel/vulkan_hasvk/gfx7_cmd_buffer.c | 314 + src/intel/vulkan_hasvk/gfx8_cmd_buffer.c | 706 ++ src/intel/vulkan_hasvk/meson.build | 265 + .../tests/block_pool_grow_first.c | 67 + .../vulkan_hasvk/tests/block_pool_no_free.c | 153 + src/intel/vulkan_hasvk/tests/state_pool.c | 59 + .../tests/state_pool_free_list_only.c | 68 + .../vulkan_hasvk/tests/state_pool_no_free.c | 119 + .../vulkan_hasvk/tests/state_pool_padding.c | 79 + .../tests/state_pool_test_helper.h | 71 + src/intel/vulkan_hasvk/tests/test_common.h | 34 + 54 files changed, 49575 insertions(+), 4 deletions(-) create mode 100644 src/intel/vulkan_hasvk/TODO create mode 100644 src/intel/vulkan_hasvk/anv_acceleration_structure.c create mode 100644 src/intel/vulkan_hasvk/anv_allocator.c create mode 100644 src/intel/vulkan_hasvk/anv_android.c create mode 100644 src/intel/vulkan_hasvk/anv_android.h create mode 100644 src/intel/vulkan_hasvk/anv_android_stubs.c create mode 100644 src/intel/vulkan_hasvk/anv_batch_chain.c create mode 100644 src/intel/vulkan_hasvk/anv_blorp.c create mode 100644 src/intel/vulkan_hasvk/anv_bo_sync.c create mode 100644 src/intel/vulkan_hasvk/anv_cmd_buffer.c create mode 
100644 src/intel/vulkan_hasvk/anv_descriptor_set.c create mode 100644 src/intel/vulkan_hasvk/anv_device.c create mode 100644 src/intel/vulkan_hasvk/anv_formats.c create mode 100644 src/intel/vulkan_hasvk/anv_gem.c create mode 100644 src/intel/vulkan_hasvk/anv_gem_stubs.c create mode 100644 src/intel/vulkan_hasvk/anv_genX.h create mode 100644 src/intel/vulkan_hasvk/anv_image.c create mode 100644 src/intel/vulkan_hasvk/anv_measure.c create mode 100644 src/intel/vulkan_hasvk/anv_measure.h create mode 100644 src/intel/vulkan_hasvk/anv_nir.h create mode 100644 src/intel/vulkan_hasvk/anv_nir_add_base_work_group_id.c create mode 100644 src/intel/vulkan_hasvk/anv_nir_apply_pipeline_layout.c create mode 100644 src/intel/vulkan_hasvk/anv_nir_compute_push_layout.c create mode 100644 src/intel/vulkan_hasvk/anv_nir_lower_multiview.c create mode 100644 src/intel/vulkan_hasvk/anv_nir_lower_ubo_loads.c create mode 100644 src/intel/vulkan_hasvk/anv_nir_lower_ycbcr_textures.c create mode 100644 src/intel/vulkan_hasvk/anv_perf.c create mode 100644 src/intel/vulkan_hasvk/anv_pipeline.c create mode 100644 src/intel/vulkan_hasvk/anv_pipeline_cache.c create mode 100644 src/intel/vulkan_hasvk/anv_private.h create mode 100644 src/intel/vulkan_hasvk/anv_queue.c create mode 100644 src/intel/vulkan_hasvk/anv_util.c create mode 100644 src/intel/vulkan_hasvk/anv_utrace.c create mode 100644 src/intel/vulkan_hasvk/anv_wsi.c create mode 100644 src/intel/vulkan_hasvk/genX_blorp_exec.c create mode 100644 src/intel/vulkan_hasvk/genX_cmd_buffer.c create mode 100644 src/intel/vulkan_hasvk/genX_gpu_memcpy.c create mode 100644 src/intel/vulkan_hasvk/genX_pipeline.c create mode 100644 src/intel/vulkan_hasvk/genX_query.c create mode 100644 src/intel/vulkan_hasvk/genX_state.c create mode 100644 src/intel/vulkan_hasvk/gfx7_cmd_buffer.c create mode 100644 src/intel/vulkan_hasvk/gfx8_cmd_buffer.c create mode 100644 src/intel/vulkan_hasvk/meson.build create mode 100644 src/intel/vulkan_hasvk/tests/block_pool_grow_first.c create mode 100644 src/intel/vulkan_hasvk/tests/block_pool_no_free.c create mode 100644 src/intel/vulkan_hasvk/tests/state_pool.c create mode 100644 src/intel/vulkan_hasvk/tests/state_pool_free_list_only.c create mode 100644 src/intel/vulkan_hasvk/tests/state_pool_no_free.c create mode 100644 src/intel/vulkan_hasvk/tests/state_pool_padding.c create mode 100644 src/intel/vulkan_hasvk/tests/state_pool_test_helper.h create mode 100644 src/intel/vulkan_hasvk/tests/test_common.h diff --git a/meson.build b/meson.build index bec326c462f..1f310a24537 100644 --- a/meson.build +++ b/meson.build @@ -250,7 +250,7 @@ _vulkan_drivers = get_option('vulkan-drivers') if _vulkan_drivers.contains('auto') if system_has_kms_drm if host_machine.cpu_family().startswith('x86') - _vulkan_drivers = ['amd', 'intel', 'swrast'] + _vulkan_drivers = ['amd', 'intel', 'intel_hasvk', 'swrast'] elif ['arm', 'aarch64'].contains(host_machine.cpu_family()) _vulkan_drivers = ['swrast'] elif ['mips', 'mips64', 'riscv32', 'riscv64'].contains(host_machine.cpu_family()) @@ -269,6 +269,7 @@ if _vulkan_drivers.contains('auto') endif with_intel_vk = _vulkan_drivers.contains('intel') +with_intel_hasvk = _vulkan_drivers.contains('intel_hasvk') with_amd_vk = _vulkan_drivers.contains('amd') with_freedreno_vk = _vulkan_drivers.contains('freedreno') with_panfrost_vk = _vulkan_drivers.contains('panfrost') @@ -283,7 +284,7 @@ with_microsoft_vk = _vulkan_drivers.contains('microsoft-experimental') with_any_vk = _vulkan_drivers.length() != 0 with_any_broadcom = 
with_gallium_vc4 or with_gallium_v3d or with_broadcom_vk -with_any_intel = with_intel_vk or with_gallium_iris or with_gallium_crocus or with_intel_tools +with_any_intel = with_intel_vk or with_intel_hasvk or with_gallium_iris or with_gallium_crocus or with_intel_tools if with_swrast_vk and not with_gallium_softpipe error('swrast vulkan requires gallium swrast') @@ -1549,7 +1550,7 @@ endif if cc.has_function('dl_iterate_phdr') pre_args += '-DHAVE_DL_ITERATE_PHDR' -elif with_intel_vk +elif with_intel_vk or with_intel_hasvk error('Intel "Anvil" Vulkan driver requires the dl_iterate_phdr function') endif diff --git a/meson_options.txt b/meson_options.txt index 2a5d9fddf6b..283a02bc934 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -198,7 +198,7 @@ option( 'vulkan-drivers', type : 'array', value : ['auto'], - choices : ['auto', 'amd', 'broadcom', 'freedreno', 'imagination-experimental', 'intel', 'microsoft-experimental', 'panfrost', 'swrast', 'virtio-experimental'], + choices : ['auto', 'amd', 'broadcom', 'freedreno', 'imagination-experimental', 'intel', 'intel_hasvk', 'microsoft-experimental', 'panfrost', 'swrast', 'virtio-experimental'], description : 'List of vulkan drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built' ) option( diff --git a/src/intel/meson.build b/src/intel/meson.build index 5d177553d89..ad77e11e3bb 100644 --- a/src/intel/meson.build +++ b/src/intel/meson.build @@ -38,3 +38,6 @@ endif if with_intel_vk subdir('vulkan') endif +if with_intel_hasvk + subdir('vulkan_hasvk') +endif diff --git a/src/intel/vulkan_hasvk/TODO b/src/intel/vulkan_hasvk/TODO new file mode 100644 index 00000000000..4c41e251888 --- /dev/null +++ b/src/intel/vulkan_hasvk/TODO @@ -0,0 +1,13 @@ +Intel Vulkan ToDo +================= + +Missing Features: + - Investigate CTS failures on HSW + - Sparse memory + +Performance: + - Multi-{sampled/gfx8,LOD} HiZ + - MSAA fast clears + - Pushing pieces of UBOs? + - Enable guardband clipping + - Use soft-pin to avoid relocations diff --git a/src/intel/vulkan_hasvk/anv_acceleration_structure.c b/src/intel/vulkan_hasvk/anv_acceleration_structure.c new file mode 100644 index 00000000000..f003772e9c1 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_acceleration_structure.c @@ -0,0 +1,251 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "anv_private.h" + +void +anv_GetAccelerationStructureBuildSizesKHR( + VkDevice device, + VkAccelerationStructureBuildTypeKHR buildType, + const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo, + const uint32_t* pMaxPrimitiveCounts, + VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo) +{ + assert(pSizeInfo->sType == + VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR); + + pSizeInfo->accelerationStructureSize = 0; /* TODO */ + + uint64_t cpu_build_scratch_size = 0; /* TODO */ + uint64_t cpu_update_scratch_size = cpu_build_scratch_size; + + uint64_t gpu_build_scratch_size = 0; /* TODO */ + uint64_t gpu_update_scratch_size = gpu_build_scratch_size; + + switch (buildType) { + case VK_ACCELERATION_STRUCTURE_BUILD_TYPE_HOST_KHR: + pSizeInfo->buildScratchSize = cpu_build_scratch_size; + pSizeInfo->updateScratchSize = cpu_update_scratch_size; + break; + + case VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR: + pSizeInfo->buildScratchSize = gpu_build_scratch_size; + pSizeInfo->updateScratchSize = gpu_update_scratch_size; + break; + + case VK_ACCELERATION_STRUCTURE_BUILD_TYPE_HOST_OR_DEVICE_KHR: + pSizeInfo->buildScratchSize = MAX2(cpu_build_scratch_size, + gpu_build_scratch_size); + pSizeInfo->updateScratchSize = MAX2(cpu_update_scratch_size, + gpu_update_scratch_size); + break; + + default: + unreachable("Invalid acceleration structure build type"); + } +} + +VkResult +anv_CreateAccelerationStructureKHR( + VkDevice _device, + const VkAccelerationStructureCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkAccelerationStructureKHR* pAccelerationStructure) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer); + struct anv_acceleration_structure *accel; + + accel = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*accel), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (accel == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + vk_object_base_init(&device->vk, &accel->base, + VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR); + + accel->size = pCreateInfo->size; + accel->address = anv_address_add(buffer->address, pCreateInfo->offset); + + *pAccelerationStructure = anv_acceleration_structure_to_handle(accel); + + return VK_SUCCESS; +} + +void +anv_DestroyAccelerationStructureKHR( + VkDevice _device, + VkAccelerationStructureKHR accelerationStructure, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_acceleration_structure, accel, accelerationStructure); + + if (!accel) + return; + + vk_object_base_finish(&accel->base); + vk_free2(&device->vk.alloc, pAllocator, accel); +} + +VkDeviceAddress +anv_GetAccelerationStructureDeviceAddressKHR( + VkDevice device, + const VkAccelerationStructureDeviceAddressInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_acceleration_structure, accel, + pInfo->accelerationStructure); + + assert(!anv_address_is_null(accel->address)); + assert(anv_bo_is_pinned(accel->address.bo)); + + return anv_address_physical(accel->address); +} + +void +anv_GetDeviceAccelerationStructureCompatibilityKHR( + VkDevice device, + const VkAccelerationStructureVersionInfoKHR* pVersionInfo, + VkAccelerationStructureCompatibilityKHR* pCompatibility) +{ + unreachable("Unimplemented"); +} + +VkResult +anv_BuildAccelerationStructuresKHR( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const 
VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkResult +anv_CopyAccelerationStructureKHR( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + const VkCopyAccelerationStructureInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkResult +anv_CopyAccelerationStructureToMemoryKHR( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkResult +anv_CopyMemoryToAccelerationStructureKHR( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkResult +anv_WriteAccelerationStructuresPropertiesKHR( + VkDevice _device, + uint32_t accelerationStructureCount, + const VkAccelerationStructureKHR* pAccelerationStructures, + VkQueryType queryType, + size_t dataSize, + void* pData, + size_t stride) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +void +anv_CmdBuildAccelerationStructuresKHR( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) +{ + unreachable("Unimplemented"); +} + +void +anv_CmdBuildAccelerationStructuresIndirectKHR( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts) +{ + unreachable("Unimplemented"); +} + +void +anv_CmdCopyAccelerationStructureKHR( + VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureInfoKHR* pInfo) +{ + unreachable("Unimplemented"); +} + +void +anv_CmdCopyAccelerationStructureToMemoryKHR( + VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo) +{ + unreachable("Unimplemented"); +} + +void +anv_CmdCopyMemoryToAccelerationStructureKHR( + VkCommandBuffer commandBuffer, + const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo) +{ + unreachable("Unimplemented"); +} + +void +anv_CmdWriteAccelerationStructuresPropertiesKHR( + VkCommandBuffer commandBuffer, + uint32_t accelerationStructureCount, + const VkAccelerationStructureKHR* pAccelerationStructures, + VkQueryType queryType, + VkQueryPool queryPool, + uint32_t firstQuery) +{ + unreachable("Unimplemented"); +} diff --git a/src/intel/vulkan_hasvk/anv_allocator.c b/src/intel/vulkan_hasvk/anv_allocator.c new file mode 100644 index 00000000000..ce64811b178 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_allocator.c @@ -0,0 +1,2176 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, 
modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <stdlib.h> +#include <unistd.h> +#include <limits.h> +#include <assert.h> +#include <sys/mman.h> + +#include "anv_private.h" + +#include "common/intel_aux_map.h" +#include "util/anon_file.h" +#include "util/futex.h" + +#ifdef HAVE_VALGRIND +#define VG_NOACCESS_READ(__ptr) ({ \ + VALGRIND_MAKE_MEM_DEFINED((__ptr), sizeof(*(__ptr))); \ + __typeof(*(__ptr)) __val = *(__ptr); \ + VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));\ + __val; \ +}) +#define VG_NOACCESS_WRITE(__ptr, __val) ({ \ + VALGRIND_MAKE_MEM_UNDEFINED((__ptr), sizeof(*(__ptr))); \ + *(__ptr) = (__val); \ + VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr))); \ +}) +#else +#define VG_NOACCESS_READ(__ptr) (*(__ptr)) +#define VG_NOACCESS_WRITE(__ptr, __val) (*(__ptr) = (__val)) +#endif + +#ifndef MAP_POPULATE +#define MAP_POPULATE 0 +#endif + +/* Design goals: + * + * - Lock free (except when resizing underlying bos) + * + * - Constant time allocation with typically only one atomic + * + * - Multiple allocation sizes without fragmentation + * + * - Can grow while keeping addresses and offset of contents stable + * + * - All allocations within one bo so we can point one of the + * STATE_BASE_ADDRESS pointers at it. + * + * The overall design is a two-level allocator: top level is a fixed size, big + * block (8k) allocator, which operates out of a bo. Allocation is done by + * either pulling a block from the free list or growing the used range of the + * bo. Growing the range may run out of space in the bo which we then need to + * grow. Growing the bo is tricky in a multi-threaded, lockless environment: + * we need to keep all pointers and contents in the old map valid. GEM bos in + * general can't grow, but we use a trick: we create a memfd and use ftruncate + * to grow it as necessary. We mmap the new size and then create a gem bo for + * it using the new gem userptr ioctl. Without heavy-handed locking around + * our allocation fast-path, there isn't really a way to munmap the old mmap, + * so we just keep it around until garbage collection time. While the block + * allocator is lockless for normal operations, we block other threads trying + * to allocate while we're growing the map. It shouldn't happen often, and + * growing is fast anyway. + * + * At the next level we can use various sub-allocators. The state pool is a + * pool of smaller, fixed size objects, which operates much like the block + * pool. It uses a free list for freeing objects, but when it runs out of + * space it just allocates a new block from the block pool. This allocator is + * intended for longer lived state objects such as SURFACE_STATE and most + * other persistent state objects in the API.
We may need to track more info + * with these objects and a pointer back to the CPU object (e.g. VkImage). In + * those cases we just allocate a slightly bigger object and put the extra + * state after the GPU state object. + * + * The state stream allocator works similarly to how the i965 DRI driver streams + * all its state. Even with Vulkan, we need to emit transient state (whether + * surface state base or dynamic state base), and for that we can just get a + * block and fill it up. These cases are local to a command buffer and the + * sub-allocator need not be thread safe. The streaming allocator gets a new + * block when it runs out of space and chains them together so they can be + * easily freed. + */ + +/* Allocations are always at least 64 byte aligned, so 1 is an invalid value. + * We use it to indicate the free list is empty. */ +#define EMPTY UINT32_MAX + +/* On FreeBSD PAGE_SIZE is already defined in + * /usr/include/machine/param.h that is indirectly + * included here. + */ +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +struct anv_mmap_cleanup { + void *map; + size_t size; +}; + +static inline uint32_t +ilog2_round_up(uint32_t value) +{ + assert(value != 0); + return 32 - __builtin_clz(value - 1); +} + +static inline uint32_t +round_to_power_of_two(uint32_t value) +{ + return 1 << ilog2_round_up(value); +} + +struct anv_state_table_cleanup { + void *map; + size_t size; +}; + +#define ANV_STATE_TABLE_CLEANUP_INIT ((struct anv_state_table_cleanup){0}) +#define ANV_STATE_ENTRY_SIZE (sizeof(struct anv_free_entry)) + +static VkResult +anv_state_table_expand_range(struct anv_state_table *table, uint32_t size); + +VkResult +anv_state_table_init(struct anv_state_table *table, + struct anv_device *device, + uint32_t initial_entries) +{ + VkResult result; + + table->device = device; + + /* Just make it 2GB up-front. The Linux kernel won't actually back it + * with pages until we either map and fault on one of them or we use + * userptr and send a chunk of it off to the GPU. + */ + table->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "state table"); + if (table->fd == -1) + return vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + + if (!u_vector_init(&table->cleanups, 8, + sizeof(struct anv_state_table_cleanup))) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_fd; + } + + table->state.next = 0; + table->state.end = 0; + table->size = 0; + + uint32_t initial_size = initial_entries * ANV_STATE_ENTRY_SIZE; + result = anv_state_table_expand_range(table, initial_size); + if (result != VK_SUCCESS) + goto fail_cleanups; + + return VK_SUCCESS; + + fail_cleanups: + u_vector_finish(&table->cleanups); + fail_fd: + close(table->fd); + + return result; +} + +static VkResult +anv_state_table_expand_range(struct anv_state_table *table, uint32_t size) +{ + void *map; + struct anv_state_table_cleanup *cleanup; + + /* Assert that we only ever grow the pool */ + assert(size >= table->state.end); + + /* Make sure that we don't go outside the bounds of the memfd */ + if (size > BLOCK_POOL_MEMFD_SIZE) + return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY); + + cleanup = u_vector_add(&table->cleanups); + if (!cleanup) + return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY); + + *cleanup = ANV_STATE_TABLE_CLEANUP_INIT; + + /* Just leak the old map until we destroy the pool. We can't munmap it + * without races or imposing locking on the block allocate fast path. On + * the whole the leaked maps add up to less than the size of the + * current map.
MAP_POPULATE seems like the right thing to do, but we + * should try to get some numbers. + */ + map = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, table->fd, 0); + if (map == MAP_FAILED) { + return vk_errorf(table->device, VK_ERROR_OUT_OF_HOST_MEMORY, + "mmap failed: %m"); + } + + cleanup->map = map; + cleanup->size = size; + + table->map = map; + table->size = size; + + return VK_SUCCESS; +} + +static VkResult +anv_state_table_grow(struct anv_state_table *table) +{ + VkResult result = VK_SUCCESS; + + uint32_t used = align_u32(table->state.next * ANV_STATE_ENTRY_SIZE, + PAGE_SIZE); + uint32_t old_size = table->size; + + /* The block pool is always initialized to a nonzero size and this function + * is always called after initialization. + */ + assert(old_size > 0); + + uint32_t required = MAX2(used, old_size); + if (used * 2 <= required) { + /* If we're in this case then this isn't the first allocation and we + * already have enough space on both sides to hold double what we + * have allocated. There's nothing for us to do. + */ + goto done; + } + + uint32_t size = old_size * 2; + while (size < required) + size *= 2; + + assert(size > table->size); + + result = anv_state_table_expand_range(table, size); + + done: + return result; +} + +void +anv_state_table_finish(struct anv_state_table *table) +{ + struct anv_state_table_cleanup *cleanup; + + u_vector_foreach(cleanup, &table->cleanups) { + if (cleanup->map) + munmap(cleanup->map, cleanup->size); + } + + u_vector_finish(&table->cleanups); + + close(table->fd); +} + +VkResult +anv_state_table_add(struct anv_state_table *table, uint32_t *idx, + uint32_t count) +{ + struct anv_block_state state, old, new; + VkResult result; + + assert(idx); + + while(1) { + state.u64 = __sync_fetch_and_add(&table->state.u64, count); + if (state.next + count <= state.end) { + assert(table->map); + struct anv_free_entry *entry = &table->map[state.next]; + for (int i = 0; i < count; i++) { + entry[i].state.idx = state.next + i; + } + *idx = state.next; + return VK_SUCCESS; + } else if (state.next <= state.end) { + /* We allocated the first block outside the pool so we have to grow + * the pool. pool_state->next acts as a mutex: threads who try to + * allocate now will get block indexes above the current limit and + * hit futex_wait below.
+ */ + new.next = state.next + count; + do { + result = anv_state_table_grow(table); + if (result != VK_SUCCESS) + return result; + new.end = table->size / ANV_STATE_ENTRY_SIZE; + } while (new.end < new.next); + + old.u64 = __sync_lock_test_and_set(&table->state.u64, new.u64); + if (old.next != state.next) + futex_wake(&table->state.end, INT_MAX); + } else { + futex_wait(&table->state.end, state.end, NULL); + continue; + } + } +} + +void +anv_free_list_push(union anv_free_list *list, + struct anv_state_table *table, + uint32_t first, uint32_t count) +{ + union anv_free_list current, old, new; + uint32_t last = first; + + for (uint32_t i = 1; i < count; i++, last++) + table->map[last].next = last + 1; + + old.u64 = list->u64; + do { + current = old; + table->map[last].next = current.offset; + new.offset = first; + new.count = current.count + 1; + old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64); + } while (old.u64 != current.u64); +} + +struct anv_state * +anv_free_list_pop(union anv_free_list *list, + struct anv_state_table *table) +{ + union anv_free_list current, new, old; + + current.u64 = list->u64; + while (current.offset != EMPTY) { + __sync_synchronize(); + new.offset = table->map[current.offset].next; + new.count = current.count + 1; + old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64); + if (old.u64 == current.u64) { + struct anv_free_entry *entry = &table->map[current.offset]; + return &entry->state; + } + current = old; + } + + return NULL; +} + +static VkResult +anv_block_pool_expand_range(struct anv_block_pool *pool, + uint32_t center_bo_offset, uint32_t size); + +VkResult +anv_block_pool_init(struct anv_block_pool *pool, + struct anv_device *device, + const char *name, + uint64_t start_address, + uint32_t initial_size) +{ + VkResult result; + + if (device->info->verx10 >= 125) { + /* Make sure VMA addresses are 2MiB aligned for the block pool */ + assert(anv_is_aligned(start_address, 2 * 1024 * 1024)); + assert(anv_is_aligned(initial_size, 2 * 1024 * 1024)); + } + + pool->name = name; + pool->device = device; + pool->use_relocations = anv_use_relocations(device->physical); + pool->nbos = 0; + pool->size = 0; + pool->center_bo_offset = 0; + pool->start_address = intel_canonical_address(start_address); + pool->map = NULL; + + if (!pool->use_relocations) { + pool->bo = NULL; + pool->fd = -1; + } else { + /* Just make it 2GB up-front. The Linux kernel won't actually back it + * with pages until we either map and fault on one of them or we use + * userptr and send a chunk of it off to the GPU. + */ + pool->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "block pool"); + if (pool->fd == -1) + return vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + + pool->wrapper_bo = (struct anv_bo) { + .refcount = 1, + .offset = -1, + .is_wrapper = true, + }; + pool->bo = &pool->wrapper_bo; + } + + if (!u_vector_init(&pool->mmap_cleanups, 8, + sizeof(struct anv_mmap_cleanup))) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_fd; + } + + pool->state.next = 0; + pool->state.end = 0; + pool->back_state.next = 0; + pool->back_state.end = 0; + + result = anv_block_pool_expand_range(pool, 0, initial_size); + if (result != VK_SUCCESS) + goto fail_mmap_cleanups; + + /* Make the entire pool available in the front of the pool. If back + * allocation needs to use this space, the "ends" will be re-arranged. 
+ */ + pool->state.end = pool->size; + + return VK_SUCCESS; + + fail_mmap_cleanups: + u_vector_finish(&pool->mmap_cleanups); + fail_fd: + if (pool->fd >= 0) + close(pool->fd); + + return result; +} + +void +anv_block_pool_finish(struct anv_block_pool *pool) +{ + anv_block_pool_foreach_bo(bo, pool) { + assert(bo->refcount == 1); + anv_device_release_bo(pool->device, bo); + } + + struct anv_mmap_cleanup *cleanup; + u_vector_foreach(cleanup, &pool->mmap_cleanups) + munmap(cleanup->map, cleanup->size); + u_vector_finish(&pool->mmap_cleanups); + + if (pool->fd >= 0) + close(pool->fd); +} + +static VkResult +anv_block_pool_expand_range(struct anv_block_pool *pool, + uint32_t center_bo_offset, uint32_t size) +{ + /* Assert that we only ever grow the pool */ + assert(center_bo_offset >= pool->back_state.end); + assert(size - center_bo_offset >= pool->state.end); + + /* Assert that we don't go outside the bounds of the memfd */ + assert(center_bo_offset <= BLOCK_POOL_MEMFD_CENTER); + assert(!pool->use_relocations || + size - center_bo_offset <= + BLOCK_POOL_MEMFD_SIZE - BLOCK_POOL_MEMFD_CENTER); + + /* For state pool BOs we have to be a bit careful about where we place them + * in the GTT. There are two documented workarounds for state base address + * placement: Wa32bitGeneralStateOffset and Wa32bitInstructionBaseOffset + * which state that those two base addresses do not support 48-bit + * addresses and need to be placed in the bottom 32-bit range. + * Unfortunately, this is not quite accurate. + * + * The real problem is that we always set the size of our state pools in + * STATE_BASE_ADDRESS to 0xfffff (the maximum) even though the BO is most + * likely significantly smaller. We do this because we do not know at the + * time we emit STATE_BASE_ADDRESS whether or not we will need to expand + * the pool during command buffer building so we don't actually have a + * valid final size. If the address + size, as seen by STATE_BASE_ADDRESS, + * overflows 48 bits, the GPU appears to treat all accesses to the buffer + * as being out of bounds and returns zero. For dynamic state, this + * usually just leads to rendering corruptions, but shaders that are all + * zero hang the GPU immediately. + * + * The easiest solution is to do exactly what the bogus workarounds say to + * do: restrict these buffers to 32-bit addresses. We could also pin the + * BO to some particular location of our choosing, but that's significantly + * more work than just not setting a flag. So, we explicitly DO NOT set + * the EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and the kernel does all of the + * hard work for us. When using softpin, we're in control and the fixed + * addresses we choose are fine for base addresses. + */ + enum anv_bo_alloc_flags bo_alloc_flags = ANV_BO_ALLOC_CAPTURE; + if (pool->use_relocations) + bo_alloc_flags |= ANV_BO_ALLOC_32BIT_ADDRESS; + + if (!pool->use_relocations) { + uint32_t new_bo_size = size - pool->size; + struct anv_bo *new_bo; + assert(center_bo_offset == 0); + VkResult result = anv_device_alloc_bo(pool->device, + pool->name, + new_bo_size, + bo_alloc_flags | + ANV_BO_ALLOC_FIXED_ADDRESS | + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED, + pool->start_address + pool->size, + &new_bo); + if (result != VK_SUCCESS) + return result; + + pool->bos[pool->nbos++] = new_bo; + + /* This pointer will always point to the first BO in the list */ + pool->bo = pool->bos[0]; + } else { + /* Just leak the old map until we destroy the pool.
We can't munmap it + * without races or imposing locking on the block allocate fast path. On + * the whole the leaked maps adds up to less than the size of the + * current map. MAP_POPULATE seems like the right thing to do, but we + * should try to get some numbers. + */ + void *map = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, pool->fd, + BLOCK_POOL_MEMFD_CENTER - center_bo_offset); + if (map == MAP_FAILED) + return vk_errorf(pool->device, VK_ERROR_MEMORY_MAP_FAILED, + "mmap failed: %m"); + + struct anv_bo *new_bo; + VkResult result = anv_device_import_bo_from_host_ptr(pool->device, + map, size, + bo_alloc_flags, + 0 /* client_address */, + &new_bo); + if (result != VK_SUCCESS) { + munmap(map, size); + return result; + } + + struct anv_mmap_cleanup *cleanup = u_vector_add(&pool->mmap_cleanups); + if (!cleanup) { + munmap(map, size); + anv_device_release_bo(pool->device, new_bo); + return vk_error(pool->device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + cleanup->map = map; + cleanup->size = size; + + /* Now that we mapped the new memory, we can write the new + * center_bo_offset back into pool and update pool->map. */ + pool->center_bo_offset = center_bo_offset; + pool->map = map + center_bo_offset; + + pool->bos[pool->nbos++] = new_bo; + pool->wrapper_bo.map = new_bo; + } + + assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS); + pool->size = size; + + return VK_SUCCESS; +} + +/** Returns current memory map of the block pool. + * + * The returned pointer points to the map for the memory at the specified + * offset. The offset parameter is relative to the "center" of the block pool + * rather than the start of the block pool BO map. + */ +void* +anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size) +{ + if (!pool->use_relocations) { + struct anv_bo *bo = NULL; + int32_t bo_offset = 0; + anv_block_pool_foreach_bo(iter_bo, pool) { + if (offset < bo_offset + iter_bo->size) { + bo = iter_bo; + break; + } + bo_offset += iter_bo->size; + } + assert(bo != NULL); + assert(offset >= bo_offset); + assert((offset - bo_offset) + size <= bo->size); + + return bo->map + (offset - bo_offset); + } else { + return pool->map + offset; + } +} + +/** Grows and re-centers the block pool. + * + * We grow the block pool in one or both directions in such a way that the + * following conditions are met: + * + * 1) The size of the entire pool is always a power of two. + * + * 2) The pool only grows on both ends. Neither end can get + * shortened. + * + * 3) At the end of the allocation, we have about twice as much space + * allocated for each end as we have used. This way the pool doesn't + * grow too far in one direction or the other. + * + * 4) If the _alloc_back() has never been called, then the back portion of + * the pool retains a size of zero. (This makes it easier for users of + * the block pool that only want a one-sided pool.) + * + * 5) We have enough space allocated for at least one more block in + * whichever side `state` points to. + * + * 6) The center of the pool is always aligned to both the block_size of + * the pool and a 4K CPU page. + */ +static uint32_t +anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state, + uint32_t contiguous_size) +{ + VkResult result = VK_SUCCESS; + + pthread_mutex_lock(&pool->device->mutex); + + assert(state == &pool->state || state == &pool->back_state); + + /* Gather a little usage information on the pool. 
Since we may have + * threads waiting in queue to get some storage while we resize, it's + * actually possible that total_used will be larger than old_size. In + * particular, block_pool_alloc() increments state->next prior to + * calling block_pool_grow, so this ensures that we get enough space for + * whichever side tries to grow the pool. + * + * We align to a page size because it makes it easier to do our + * calculations later in such a way that we stay page-aligned. + */ + uint32_t back_used = align_u32(pool->back_state.next, PAGE_SIZE); + uint32_t front_used = align_u32(pool->state.next, PAGE_SIZE); + uint32_t total_used = front_used + back_used; + + assert(state == &pool->state || back_used > 0); + + uint32_t old_size = pool->size; + + /* The block pool is always initialized to a nonzero size and this function + * is always called after initialization. + */ + assert(old_size > 0); + + const uint32_t old_back = pool->center_bo_offset; + const uint32_t old_front = old_size - pool->center_bo_offset; + + /* The back_used and front_used may actually be smaller than the actual + * requirement because they are based on the next pointers which are + * updated prior to calling this function. + */ + uint32_t back_required = MAX2(back_used, old_back); + uint32_t front_required = MAX2(front_used, old_front); + + if (!pool->use_relocations) { + /* With softpin, the pool is made up of a bunch of buffers with separate + * maps. Make sure we have enough contiguous space that we can get a + * properly contiguous map for the next chunk. + */ + assert(old_back == 0); + front_required = MAX2(front_required, old_front + contiguous_size); + } + + if (back_used * 2 <= back_required && front_used * 2 <= front_required) { + /* If we're in this case then this isn't the first allocation and we + * already have enough space on both sides to hold double what we + * have allocated. There's nothing for us to do. + */ + goto done; + } + + uint32_t size = old_size * 2; + while (size < back_required + front_required) + size *= 2; + + assert(size > pool->size); + + /* We compute a new center_bo_offset such that, when we double the size + * of the pool, we maintain the ratio of how much is used by each side. + * This way things should remain more-or-less balanced. + */ + uint32_t center_bo_offset; + if (back_used == 0) { + /* If we're in this case then we have never called alloc_back(). In + * this case, we want to keep the offset at 0 to make things as simple + * as possible for users that don't care about back allocations. + */ + center_bo_offset = 0; + } else { + /* Try to "center" the allocation based on how much is currently in + * use on each side of the center line. + */ + center_bo_offset = ((uint64_t)size * back_used) / total_used; + + /* Align down to a multiple of the page size */ + center_bo_offset &= ~(PAGE_SIZE - 1); + + assert(center_bo_offset >= back_used); + + /* Make sure we don't shrink the back end of the pool */ + if (center_bo_offset < back_required) + center_bo_offset = back_required; + + /* Make sure that we don't shrink the front end of the pool */ + if (size - center_bo_offset < front_required) + center_bo_offset = size - front_required; + } + + assert(center_bo_offset % PAGE_SIZE == 0); + + result = anv_block_pool_expand_range(pool, center_bo_offset, size); + +done: + pthread_mutex_unlock(&pool->device->mutex); + + if (result == VK_SUCCESS) { + /* Return the appropriate new size. This function never actually + * updates state->next.
Instead, we let the caller do that because it + * needs to do so in order to maintain its concurrency model. + */ + if (state == &pool->state) { + return pool->size - pool->center_bo_offset; + } else { + assert(pool->center_bo_offset > 0); + return pool->center_bo_offset; + } + } else { + return 0; + } +} + +static uint32_t +anv_block_pool_alloc_new(struct anv_block_pool *pool, + struct anv_block_state *pool_state, + uint32_t block_size, uint32_t *padding) +{ + struct anv_block_state state, old, new; + + /* Most allocations won't generate any padding */ + if (padding) + *padding = 0; + + while (1) { + state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size); + if (state.next + block_size <= state.end) { + return state.next; + } else if (state.next <= state.end) { + if (!pool->use_relocations && state.next < state.end) { + /* We need to grow the block pool, but still have some leftover + * space that can't be used by that particular allocation. So we + * add that as a "padding", and return it. + */ + uint32_t leftover = state.end - state.next; + + /* If there is some leftover space in the pool, the caller must + * deal with it. + */ + assert(leftover == 0 || padding); + if (padding) + *padding = leftover; + state.next += leftover; + } + + /* We allocated the first block outside the pool so we have to grow + * the pool. pool_state->next acts as a mutex: threads who try to + * allocate now will get block indexes above the current limit and + * hit futex_wait below. + */ + new.next = state.next + block_size; + do { + new.end = anv_block_pool_grow(pool, pool_state, block_size); + } while (new.end < new.next); + + old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64); + if (old.next != state.next) + futex_wake(&pool_state->end, INT_MAX); + return state.next; + } else { + futex_wait(&pool_state->end, state.end, NULL); + continue; + } + } +} + +int32_t +anv_block_pool_alloc(struct anv_block_pool *pool, + uint32_t block_size, uint32_t *padding) +{ + uint32_t offset; + + offset = anv_block_pool_alloc_new(pool, &pool->state, block_size, padding); + + return offset; +} + +/* Allocates a block out of the back of the block pool. + * + * This will allocate a block earlier than the "start" of the block pool. + * The offsets returned from this function will be negative but will still + * be correct relative to the block pool's map pointer. + * + * If you ever use anv_block_pool_alloc_back, then you will have to do + * gymnastics with the block pool's BO when doing relocations. + */ +int32_t +anv_block_pool_alloc_back(struct anv_block_pool *pool, + uint32_t block_size) +{ + int32_t offset = anv_block_pool_alloc_new(pool, &pool->back_state, + block_size, NULL); + + /* The offset we get out of anv_block_pool_alloc_new() is actually the + * number of bytes downwards from the middle to the end of the block. + * We need to turn it into a (negative) offset from the middle to the + * start of the block.
+ */ + assert(offset >= 0); + return -(offset + block_size); +} + +VkResult +anv_state_pool_init(struct anv_state_pool *pool, + struct anv_device *device, + const char *name, + uint64_t base_address, + int32_t start_offset, + uint32_t block_size) +{ + /* We don't want to ever see signed overflow */ + assert(start_offset < INT32_MAX - (int32_t)BLOCK_POOL_MEMFD_SIZE); + + uint32_t initial_size = block_size * 16; + if (device->info->verx10 >= 125) + initial_size = MAX2(initial_size, 2 * 1024 * 1024); + + VkResult result = anv_block_pool_init(&pool->block_pool, device, name, + base_address + start_offset, + initial_size); + if (result != VK_SUCCESS) + return result; + + pool->start_offset = start_offset; + + result = anv_state_table_init(&pool->table, device, 64); + if (result != VK_SUCCESS) { + anv_block_pool_finish(&pool->block_pool); + return result; + } + + assert(util_is_power_of_two_or_zero(block_size)); + pool->block_size = block_size; + pool->back_alloc_free_list = ANV_FREE_LIST_EMPTY; + for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) { + pool->buckets[i].free_list = ANV_FREE_LIST_EMPTY; + pool->buckets[i].block.next = 0; + pool->buckets[i].block.end = 0; + } + VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false)); + + return VK_SUCCESS; +} + +void +anv_state_pool_finish(struct anv_state_pool *pool) +{ + VG(VALGRIND_DESTROY_MEMPOOL(pool)); + anv_state_table_finish(&pool->table); + anv_block_pool_finish(&pool->block_pool); +} + +static uint32_t +anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool, + struct anv_block_pool *block_pool, + uint32_t state_size, + uint32_t block_size, + uint32_t *padding) +{ + struct anv_block_state block, old, new; + uint32_t offset; + + /* We don't always use anv_block_pool_alloc(), which would set *padding to + * zero for us. So if we have a pointer to padding, we must zero it out + * ourselves here, to make sure we always return some sensible value. + */ + if (padding) + *padding = 0; + + /* If our state is large, we don't need any sub-allocation from a block. + * Instead, we just grab whole (potentially large) blocks. + */ + if (state_size >= block_size) + return anv_block_pool_alloc(block_pool, state_size, padding); + + restart: + block.u64 = __sync_fetch_and_add(&pool->block.u64, state_size); + + if (block.next < block.end) { + return block.next; + } else if (block.next == block.end) { + offset = anv_block_pool_alloc(block_pool, block_size, padding); + new.next = offset + state_size; + new.end = offset + block_size; + old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64); + if (old.next != block.next) + futex_wake(&pool->block.end, INT_MAX); + return offset; + } else { + futex_wait(&pool->block.end, block.end, NULL); + goto restart; + } +} + +static uint32_t +anv_state_pool_get_bucket(uint32_t size) +{ + unsigned size_log2 = ilog2_round_up(size); + assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2); + if (size_log2 < ANV_MIN_STATE_SIZE_LOG2) + size_log2 = ANV_MIN_STATE_SIZE_LOG2; + return size_log2 - ANV_MIN_STATE_SIZE_LOG2; +} + +static uint32_t +anv_state_pool_get_bucket_size(uint32_t bucket) +{ + uint32_t size_log2 = bucket + ANV_MIN_STATE_SIZE_LOG2; + return 1 << size_log2; +} + +/** Helper to push a chunk into the state table. + * + * It creates 'count' entries into the state table and update their sizes, + * offsets and maps, also pushing them as "free" states. 
+ */ +static void +anv_state_pool_return_blocks(struct anv_state_pool *pool, + uint32_t chunk_offset, uint32_t count, + uint32_t block_size) +{ + /* Disallow returning 0 chunks */ + assert(count != 0); + + /* Make sure we always return chunks aligned to the block_size */ + assert(chunk_offset % block_size == 0); + + uint32_t st_idx; + UNUSED VkResult result = anv_state_table_add(&pool->table, &st_idx, count); + assert(result == VK_SUCCESS); + for (int i = 0; i < count; i++) { + /* update states that were added back to the state table */ + struct anv_state *state_i = anv_state_table_get(&pool->table, + st_idx + i); + state_i->alloc_size = block_size; + state_i->offset = pool->start_offset + chunk_offset + block_size * i; + state_i->map = anv_block_pool_map(&pool->block_pool, + state_i->offset, + state_i->alloc_size); + } + + uint32_t block_bucket = anv_state_pool_get_bucket(block_size); + anv_free_list_push(&pool->buckets[block_bucket].free_list, + &pool->table, st_idx, count); +} + +/** Returns a chunk of memory back to the state pool. + * + * Do a two-level split. If chunk_size is bigger than divisor + * (pool->block_size), we return as many divisor sized blocks as we can, from + * the end of the chunk. + * + * The remaining is then split into smaller blocks (starting at small_size if + * it is non-zero), with larger blocks always being taken from the end of the + * chunk. + */ +static void +anv_state_pool_return_chunk(struct anv_state_pool *pool, + uint32_t chunk_offset, uint32_t chunk_size, + uint32_t small_size) +{ + uint32_t divisor = pool->block_size; + uint32_t nblocks = chunk_size / divisor; + uint32_t rest = chunk_size - nblocks * divisor; + + if (nblocks > 0) { + /* First return divisor aligned and sized chunks. We start returning + * larger blocks from the end of the chunk, since they should already be + * aligned to divisor. Also anv_state_pool_return_blocks() only accepts + * aligned chunks. + */ + uint32_t offset = chunk_offset + rest; + anv_state_pool_return_blocks(pool, offset, nblocks, divisor); + } + + chunk_size = rest; + divisor /= 2; + + if (small_size > 0 && small_size < divisor) + divisor = small_size; + + uint32_t min_size = 1 << ANV_MIN_STATE_SIZE_LOG2; + + /* Just as before, return larger divisor aligned blocks from the end of the + * chunk first. + */ + while (chunk_size > 0 && divisor >= min_size) { + nblocks = chunk_size / divisor; + rest = chunk_size - nblocks * divisor; + if (nblocks > 0) { + anv_state_pool_return_blocks(pool, chunk_offset + rest, + nblocks, divisor); + chunk_size = rest; + } + divisor /= 2; + } +} + +static struct anv_state +anv_state_pool_alloc_no_vg(struct anv_state_pool *pool, + uint32_t size, uint32_t align) +{ + uint32_t bucket = anv_state_pool_get_bucket(MAX2(size, align)); + + struct anv_state *state; + uint32_t alloc_size = anv_state_pool_get_bucket_size(bucket); + int32_t offset; + + /* Try free list first. */ + state = anv_free_list_pop(&pool->buckets[bucket].free_list, + &pool->table); + if (state) { + assert(state->offset >= pool->start_offset); + goto done; + } + + /* Try to grab a chunk from some larger bucket and split it up */ + for (unsigned b = bucket + 1; b < ANV_STATE_BUCKETS; b++) { + state = anv_free_list_pop(&pool->buckets[b].free_list, &pool->table); + if (state) { + unsigned chunk_size = anv_state_pool_get_bucket_size(b); + int32_t chunk_offset = state->offset; + + /* First lets update the state we got to its new size. offset and map + * remain the same. 
+ */ + state->alloc_size = alloc_size; + + /* Now return the unused part of the chunk back to the pool as free + * blocks + * + * There are a couple of options as to what we do with it: + * + * 1) We could fully split the chunk into state.alloc_size sized + * pieces. However, this would mean that allocating a 16B + * state could potentially split a 2MB chunk into 512K smaller + * chunks. This would lead to unnecessary fragmentation. + * + * 2) The classic "buddy allocator" method would have us split the + * chunk in half and return one half. Then we would split the + * remaining half in half and return one half, and repeat as + * needed until we get down to the size we want. However, if + * you are allocating a bunch of the same size state (which is + * the common case), this means that every other allocation has + * to go up a level and every fourth goes up two levels, etc. + * This is not nearly as efficient as it could be if we did a + * little more work up-front. + * + * 3) Split the difference between (1) and (2) by doing a + * two-level split. If it's bigger than some fixed block_size, + * we split it into block_size sized chunks and return all but + * one of them. Then we split what remains into + * state.alloc_size sized chunks and return them. + * + * We choose something close to option (3), which is implemented with + * anv_state_pool_return_chunk(). That is done by returning the + * remaining of the chunk, with alloc_size as a hint of the size that + * we want the smaller chunk split into. + */ + anv_state_pool_return_chunk(pool, chunk_offset + alloc_size, + chunk_size - alloc_size, alloc_size); + goto done; + } + } + + uint32_t padding; + offset = anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket], + &pool->block_pool, + alloc_size, + pool->block_size, + &padding); + /* Every time we allocate a new state, add it to the state pool */ + uint32_t idx; + UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1); + assert(result == VK_SUCCESS); + + state = anv_state_table_get(&pool->table, idx); + state->offset = pool->start_offset + offset; + state->alloc_size = alloc_size; + state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size); + + if (padding > 0) { + uint32_t return_offset = offset - padding; + anv_state_pool_return_chunk(pool, return_offset, padding, 0); + } + +done: + return *state; +} + +struct anv_state +anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t size, uint32_t align) +{ + if (size == 0) + return ANV_STATE_NULL; + + struct anv_state state = anv_state_pool_alloc_no_vg(pool, size, align); + VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, size)); + return state; +} + +struct anv_state +anv_state_pool_alloc_back(struct anv_state_pool *pool) +{ + struct anv_state *state; + uint32_t alloc_size = pool->block_size; + + /* This function is only used with pools where start_offset == 0 */ + assert(pool->start_offset == 0); + + state = anv_free_list_pop(&pool->back_alloc_free_list, &pool->table); + if (state) { + assert(state->offset < pool->start_offset); + goto done; + } + + int32_t offset; + offset = anv_block_pool_alloc_back(&pool->block_pool, + pool->block_size); + uint32_t idx; + UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1); + assert(result == VK_SUCCESS); + + state = anv_state_table_get(&pool->table, idx); + state->offset = pool->start_offset + offset; + state->alloc_size = alloc_size; + state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size); + +done: + VG(VALGRIND_MEMPOOL_ALLOC(pool, 
state->map, state->alloc_size)); + return *state; +} + +static void +anv_state_pool_free_no_vg(struct anv_state_pool *pool, struct anv_state state) +{ + assert(util_is_power_of_two_or_zero(state.alloc_size)); + unsigned bucket = anv_state_pool_get_bucket(state.alloc_size); + + if (state.offset < pool->start_offset) { + assert(state.alloc_size == pool->block_size); + anv_free_list_push(&pool->back_alloc_free_list, + &pool->table, state.idx, 1); + } else { + anv_free_list_push(&pool->buckets[bucket].free_list, + &pool->table, state.idx, 1); + } +} + +void +anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state) +{ + if (state.alloc_size == 0) + return; + + VG(VALGRIND_MEMPOOL_FREE(pool, state.map)); + anv_state_pool_free_no_vg(pool, state); +} + +struct anv_state_stream_block { + struct anv_state block; + + /* The next block */ + struct anv_state_stream_block *next; + +#ifdef HAVE_VALGRIND + /* A pointer to the first user-allocated thing in this block. This is + * what valgrind sees as the start of the block. + */ + void *_vg_ptr; +#endif +}; + +/* The state stream allocator is a one-shot, single threaded allocator for + * variable sized blocks. We use it for allocating dynamic state. + */ +void +anv_state_stream_init(struct anv_state_stream *stream, + struct anv_state_pool *state_pool, + uint32_t block_size) +{ + stream->state_pool = state_pool; + stream->block_size = block_size; + + stream->block = ANV_STATE_NULL; + + /* Ensure that next + whatever > block_size. This way the first call to + * state_stream_alloc fetches a new block. + */ + stream->next = block_size; + + util_dynarray_init(&stream->all_blocks, NULL); + + VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false)); +} + +void +anv_state_stream_finish(struct anv_state_stream *stream) +{ + util_dynarray_foreach(&stream->all_blocks, struct anv_state, block) { + VG(VALGRIND_MEMPOOL_FREE(stream, block->map)); + VG(VALGRIND_MAKE_MEM_NOACCESS(block->map, block->alloc_size)); + anv_state_pool_free_no_vg(stream->state_pool, *block); + } + util_dynarray_fini(&stream->all_blocks); + + VG(VALGRIND_DESTROY_MEMPOOL(stream)); +} + +struct anv_state +anv_state_stream_alloc(struct anv_state_stream *stream, + uint32_t size, uint32_t alignment) +{ + if (size == 0) + return ANV_STATE_NULL; + + assert(alignment <= PAGE_SIZE); + + uint32_t offset = align_u32(stream->next, alignment); + if (offset + size > stream->block.alloc_size) { + uint32_t block_size = stream->block_size; + if (block_size < size) + block_size = round_to_power_of_two(size); + + stream->block = anv_state_pool_alloc_no_vg(stream->state_pool, + block_size, PAGE_SIZE); + util_dynarray_append(&stream->all_blocks, + struct anv_state, stream->block); + VG(VALGRIND_MAKE_MEM_NOACCESS(stream->block.map, block_size)); + + /* Reset back to the start */ + stream->next = offset = 0; + assert(offset + size <= stream->block.alloc_size); + } + const bool new_block = stream->next == 0; + + struct anv_state state = stream->block; + state.offset += offset; + state.alloc_size = size; + state.map += offset; + + stream->next = offset + size; + + if (new_block) { + assert(state.map == stream->block.map); + VG(VALGRIND_MEMPOOL_ALLOC(stream, state.map, size)); + } else { + /* This only updates the mempool. The newly allocated chunk is still + * marked as NOACCESS. 
*/ + VG(VALGRIND_MEMPOOL_CHANGE(stream, stream->block.map, stream->block.map, + stream->next)); + /* Mark the newly allocated chunk as undefined */ + VG(VALGRIND_MAKE_MEM_UNDEFINED(state.map, state.alloc_size)); + } + + return state; +} + +void +anv_state_reserved_pool_init(struct anv_state_reserved_pool *pool, + struct anv_state_pool *parent, + uint32_t count, uint32_t size, uint32_t alignment) +{ + pool->pool = parent; + pool->reserved_blocks = ANV_FREE_LIST_EMPTY; + pool->count = count; + + for (unsigned i = 0; i < count; i++) { + struct anv_state state = anv_state_pool_alloc(pool->pool, size, alignment); + anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1); + } +} + +void +anv_state_reserved_pool_finish(struct anv_state_reserved_pool *pool) +{ + struct anv_state *state; + + while ((state = anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table))) { + anv_state_pool_free(pool->pool, *state); + pool->count--; + } + assert(pool->count == 0); +} + +struct anv_state +anv_state_reserved_pool_alloc(struct anv_state_reserved_pool *pool) +{ + return *anv_free_list_pop(&pool->reserved_blocks, &pool->pool->table); +} + +void +anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool, + struct anv_state state) +{ + anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1); +} + +void +anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device, + const char *name) +{ + pool->name = name; + pool->device = device; + for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) { + util_sparse_array_free_list_init(&pool->free_list[i], + &device->bo_cache.bo_map, 0, + offsetof(struct anv_bo, free_index)); + } + + VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false)); +} + +void +anv_bo_pool_finish(struct anv_bo_pool *pool) +{ + for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) { + while (1) { + struct anv_bo *bo = + util_sparse_array_free_list_pop_elem(&pool->free_list[i]); + if (bo == NULL) + break; + + /* anv_device_release_bo is going to "free" it */ + VG(VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, 1)); + anv_device_release_bo(pool->device, bo); + } + } + + VG(VALGRIND_DESTROY_MEMPOOL(pool)); +} + +VkResult +anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size, + struct anv_bo **bo_out) +{ + const unsigned size_log2 = size < 4096 ? 
12 : ilog2_round_up(size); + const unsigned pow2_size = 1 << size_log2; + const unsigned bucket = size_log2 - 12; + assert(bucket < ARRAY_SIZE(pool->free_list)); + + struct anv_bo *bo = + util_sparse_array_free_list_pop_elem(&pool->free_list[bucket]); + if (bo != NULL) { + VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size)); + *bo_out = bo; + return VK_SUCCESS; + } + + VkResult result = anv_device_alloc_bo(pool->device, + pool->name, + pow2_size, + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED | + ANV_BO_ALLOC_CAPTURE, + 0 /* explicit_address */, + &bo); + if (result != VK_SUCCESS) + return result; + + /* We want it to look like it came from this pool */ + VG(VALGRIND_FREELIKE_BLOCK(bo->map, 0)); + VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size)); + + *bo_out = bo; + + return VK_SUCCESS; +} + +void +anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo) +{ + VG(VALGRIND_MEMPOOL_FREE(pool, bo->map)); + + assert(util_is_power_of_two_or_zero(bo->size)); + const unsigned size_log2 = ilog2_round_up(bo->size); + const unsigned bucket = size_log2 - 12; + assert(bucket < ARRAY_SIZE(pool->free_list)); + + assert(util_sparse_array_get(&pool->device->bo_cache.bo_map, + bo->gem_handle) == bo); + util_sparse_array_free_list_push(&pool->free_list[bucket], + &bo->gem_handle, 1); +} + +// Scratch pool + +void +anv_scratch_pool_init(struct anv_device *device, struct anv_scratch_pool *pool) +{ + memset(pool, 0, sizeof(*pool)); +} + +void +anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool) +{ + for (unsigned s = 0; s < ARRAY_SIZE(pool->bos[0]); s++) { + for (unsigned i = 0; i < 16; i++) { + if (pool->bos[i][s] != NULL) + anv_device_release_bo(device, pool->bos[i][s]); + } + } + + for (unsigned i = 0; i < 16; i++) { + if (pool->surf_states[i].map != NULL) { + anv_state_pool_free(&device->surface_state_pool, + pool->surf_states[i]); + } + } +} + +struct anv_bo * +anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool, + gl_shader_stage stage, unsigned per_thread_scratch) +{ + if (per_thread_scratch == 0) + return NULL; + + unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048); + assert(scratch_size_log2 < 16); + + assert(stage < ARRAY_SIZE(pool->bos)); + + const struct intel_device_info *devinfo = device->info; + + /* On GFX version 12.5, scratch access changed to a surface-based model. + * Instead of each shader type having its own layout based on IDs passed + * from the relevant fixed-function unit, all scratch access is based on + * thread IDs like it always has been for compute. + */ + if (devinfo->verx10 >= 125) + stage = MESA_SHADER_COMPUTE; + + struct anv_bo *bo = p_atomic_read(&pool->bos[scratch_size_log2][stage]); + + if (bo != NULL) + return bo; + + assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids)); + uint32_t size = per_thread_scratch * devinfo->max_scratch_ids[stage]; + + /* Even though the Scratch base pointers in 3DSTATE_*S are 64 bits, they + * are still relative to the general state base address. When we emit + * STATE_BASE_ADDRESS, we set general state base address to 0 and the size + * to the maximum (1 page under 4GB). This allows us to just place the + * scratch buffers anywhere we wish in the bottom 32 bits of address space + * and just set the scratch base pointer in 3DSTATE_*S using a relocation. + * However, in order to do so, we need to ensure that the kernel does not + * place the scratch BO above the 32-bit boundary. + * + * NOTE: Technically, it can't go "anywhere" because the top page is off + * limits. 
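
The BO pool allocation above buckets requests by power-of-two size, with 4 KiB (2^12) as the smallest class, so a freed BO can later satisfy any request that rounds up to the same class. A self-contained sketch of that bucket computation (the demo_ helpers are hypothetical, not the driver's):

#include <stdint.h>

/* Smallest n such that (1u << n) >= size; size must be non-zero. */
static unsigned
demo_ilog2_round_up(uint32_t size)
{
   unsigned n = 0;
   while ((1ull << n) < size)
      n++;
   return n;
}

/* Bucket 0 holds 4 KiB BOs, bucket 1 holds 8 KiB, and so on. */
static unsigned
demo_bo_pool_bucket(uint32_t size)
{
   const unsigned size_log2 = size < 4096 ? 12 : demo_ilog2_round_up(size);
   return size_log2 - 12;
}

For example, demo_bo_pool_bucket(6000) lands in bucket 1, the same 8 KiB class that a later 8 KiB request would search first.
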
However, when EXEC_OBJECT_SUPPORTS_48B_ADDRESS is set, the + * kernel allocates space using + * + * end = min_t(u64, end, (1ULL << 32) - I915_GTT_PAGE_SIZE); + * + * so nothing will ever touch the top page. + */ + const enum anv_bo_alloc_flags alloc_flags = + devinfo->verx10 < 125 ? ANV_BO_ALLOC_32BIT_ADDRESS : 0; + VkResult result = anv_device_alloc_bo(device, "scratch", size, + alloc_flags, + 0 /* explicit_address */, + &bo); + if (result != VK_SUCCESS) + return NULL; /* TODO */ + + struct anv_bo *current_bo = + p_atomic_cmpxchg(&pool->bos[scratch_size_log2][stage], NULL, bo); + if (current_bo) { + anv_device_release_bo(device, bo); + return current_bo; + } else { + return bo; + } +} + +uint32_t +anv_scratch_pool_get_surf(struct anv_device *device, + struct anv_scratch_pool *pool, + unsigned per_thread_scratch) +{ + if (per_thread_scratch == 0) + return 0; + + unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048); + assert(scratch_size_log2 < 16); + + uint32_t surf = p_atomic_read(&pool->surfs[scratch_size_log2]); + if (surf > 0) + return surf; + + struct anv_bo *bo = + anv_scratch_pool_alloc(device, pool, MESA_SHADER_COMPUTE, + per_thread_scratch); + struct anv_address addr = { .bo = bo }; + + struct anv_state state = + anv_state_pool_alloc(&device->surface_state_pool, + device->isl_dev.ss.size, 64); + + isl_buffer_fill_state(&device->isl_dev, state.map, + .address = anv_address_physical(addr), + .size_B = bo->size, + .mocs = anv_mocs(device, bo, 0), + .format = ISL_FORMAT_RAW, + .swizzle = ISL_SWIZZLE_IDENTITY, + .stride_B = per_thread_scratch, + .is_scratch = true); + + uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2], + 0, state.offset); + if (current) { + anv_state_pool_free(&device->surface_state_pool, state); + return current; + } else { + pool->surf_states[scratch_size_log2] = state; + return state.offset; + } +} + +VkResult +anv_bo_cache_init(struct anv_bo_cache *cache, struct anv_device *device) +{ + util_sparse_array_init(&cache->bo_map, sizeof(struct anv_bo), 1024); + + if (pthread_mutex_init(&cache->mutex, NULL)) { + util_sparse_array_finish(&cache->bo_map); + return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY, + "pthread_mutex_init failed: %m"); + } + + return VK_SUCCESS; +} + +void +anv_bo_cache_finish(struct anv_bo_cache *cache) +{ + util_sparse_array_finish(&cache->bo_map); + pthread_mutex_destroy(&cache->mutex); +} + +#define ANV_BO_CACHE_SUPPORTED_FLAGS \ + (EXEC_OBJECT_WRITE | \ + EXEC_OBJECT_ASYNC | \ + EXEC_OBJECT_SUPPORTS_48B_ADDRESS | \ + EXEC_OBJECT_PINNED | \ + EXEC_OBJECT_CAPTURE) + +static uint32_t +anv_bo_alloc_flags_to_bo_flags(struct anv_device *device, + enum anv_bo_alloc_flags alloc_flags) +{ + struct anv_physical_device *pdevice = device->physical; + + uint64_t bo_flags = 0; + if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS) && + pdevice->supports_48bit_addresses) + bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + + if ((alloc_flags & ANV_BO_ALLOC_CAPTURE) && pdevice->has_exec_capture) + bo_flags |= EXEC_OBJECT_CAPTURE; + + if (alloc_flags & ANV_BO_ALLOC_IMPLICIT_WRITE) { + assert(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC); + bo_flags |= EXEC_OBJECT_WRITE; + } + + if (!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC) && pdevice->has_exec_async) + bo_flags |= EXEC_OBJECT_ASYNC; + + if (pdevice->use_softpin) + bo_flags |= EXEC_OBJECT_PINNED; + + return bo_flags; +} + +static void +anv_bo_finish(struct anv_device *device, struct anv_bo *bo) +{ + if (bo->offset != 0 && anv_bo_is_pinned(bo) && !bo->has_fixed_address) + anv_vma_free(device, 
bo->offset, bo->size + bo->_ccs_size); + + if (bo->map && !bo->from_host_ptr) + anv_device_unmap_bo(device, bo, bo->map, bo->size); + + assert(bo->gem_handle != 0); + anv_gem_close(device, bo->gem_handle); +} + +static VkResult +anv_bo_vma_alloc_or_close(struct anv_device *device, + struct anv_bo *bo, + enum anv_bo_alloc_flags alloc_flags, + uint64_t explicit_address) +{ + assert(anv_bo_is_pinned(bo)); + assert(explicit_address == intel_48b_address(explicit_address)); + + uint32_t align = 4096; + + /* Gen12 CCS surface addresses need to be 64K aligned. */ + if (device->info->ver >= 12 && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)) + align = 64 * 1024; + + /* For XeHP, lmem and smem cannot share a single PDE, which means they + * can't live in the same 2MiB aligned region. + */ + if (device->info->verx10 >= 125) + align = 2 * 1024 * 1024; + + if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) { + bo->has_fixed_address = true; + bo->offset = explicit_address; + } else { + bo->offset = anv_vma_alloc(device, bo->size + bo->_ccs_size, + align, alloc_flags, explicit_address); + if (bo->offset == 0) { + anv_bo_finish(device, bo); + return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "failed to allocate virtual address for BO"); + } + } + + return VK_SUCCESS; +} + +VkResult +anv_device_alloc_bo(struct anv_device *device, + const char *name, + uint64_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t explicit_address, + struct anv_bo **bo_out) +{ + if (!device->physical->has_implicit_ccs) + assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)); + + const uint32_t bo_flags = + anv_bo_alloc_flags_to_bo_flags(device, alloc_flags); + assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS)); + + /* The kernel is going to give us whole pages anyway */ + size = align_u64(size, 4096); + + uint64_t ccs_size = 0; + if (device->info->has_aux_map && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)) { + /* Align the size up to the next multiple of 64K so we don't have any + * AUX-TT entries pointing from a 64K page to itself. + */ + size = align_u64(size, 64 * 1024); + + /* See anv_bo::_ccs_size */ + ccs_size = align_u64(DIV_ROUND_UP(size, INTEL_AUX_MAP_GFX12_CCS_SCALE), 4096); + } + + uint32_t gem_handle; + + /* If we have vram size, we have multiple memory regions and should choose + * one of them. + */ + if (anv_physical_device_has_vram(device->physical)) { + struct drm_i915_gem_memory_class_instance regions[2]; + uint32_t nregions = 0; + + /* This always try to put the object in local memory. Here + * vram_non_mappable & vram_mappable actually are the same region. + */ + regions[nregions++] = device->physical->vram_non_mappable.region; + + /* If the buffer is mapped on the host, add the system memory region. + * This ensures that if the buffer cannot live in mappable local memory, + * it can be spilled to system memory. 
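
The sizing logic above page-aligns the request, bumps it to 64 KiB granularity when an implicit CCS is attached, and then derives the CCS size as a fixed fraction of the main surface. A standalone sketch of that arithmetic; the demo_ names are hypothetical, and the ccs_scale parameter stands in for INTEL_AUX_MAP_GFX12_CCS_SCALE (assumed to be on the order of a few hundred to one):

#include <stdbool.h>
#include <stdint.h>

#define DEMO_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* a must be a power of two */
static uint64_t
demo_align_u64(uint64_t v, uint64_t a)
{
   return (v + a - 1) & ~(a - 1);
}

static void
demo_size_bo(uint64_t request, bool wants_implicit_ccs, uint64_t ccs_scale,
             uint64_t *size_out, uint64_t *ccs_size_out)
{
   uint64_t size = demo_align_u64(request, 4096);   /* whole pages anyway */
   uint64_t ccs_size = 0;
   if (wants_implicit_ccs) {
      size = demo_align_u64(size, 64 * 1024);       /* avoid AUX-TT entries
                                                     * pointing into themselves */
      ccs_size = demo_align_u64(DEMO_DIV_ROUND_UP(size, ccs_scale), 4096);
   }
   *size_out = size;
   *ccs_size_out = ccs_size;
}
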
+ */ + uint32_t flags = 0; + if ((alloc_flags & ANV_BO_ALLOC_MAPPED) || + (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE)) { + regions[nregions++] = device->physical->sys.region; + if (device->physical->vram_non_mappable.size > 0) + flags |= I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS; + } + + gem_handle = anv_gem_create_regions(device, size + ccs_size, + flags, nregions, regions); + } else { + gem_handle = anv_gem_create(device, size + ccs_size); + } + + if (gem_handle == 0) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + struct anv_bo new_bo = { + .name = name, + .gem_handle = gem_handle, + .refcount = 1, + .offset = -1, + .size = size, + ._ccs_size = ccs_size, + .flags = bo_flags, + .is_external = (alloc_flags & ANV_BO_ALLOC_EXTERNAL), + .has_client_visible_address = + (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0, + .has_implicit_ccs = ccs_size > 0 || device->info->verx10 >= 125, + }; + + if (alloc_flags & ANV_BO_ALLOC_MAPPED) { + VkResult result = anv_device_map_bo(device, &new_bo, 0, size, + 0 /* gem_flags */, &new_bo.map); + if (unlikely(result != VK_SUCCESS)) { + anv_gem_close(device, new_bo.gem_handle); + return result; + } + } + + if (alloc_flags & ANV_BO_ALLOC_SNOOPED) { + assert(alloc_flags & ANV_BO_ALLOC_MAPPED); + /* We don't want to change these defaults if it's going to be shared + * with another process. + */ + assert(!(alloc_flags & ANV_BO_ALLOC_EXTERNAL)); + + /* Regular objects are created I915_CACHING_CACHED on LLC platforms and + * I915_CACHING_NONE on non-LLC platforms. For many internal state + * objects, we'd rather take the snooping overhead than risk forgetting + * a CLFLUSH somewhere. Userptr objects are always created as + * I915_CACHING_CACHED, which on non-LLC means snooped so there's no + * need to do this there. + */ + if (!device->info->has_llc) { + anv_gem_set_caching(device, new_bo.gem_handle, + I915_CACHING_CACHED); + } + } + + if (anv_bo_is_pinned(&new_bo)) { + VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo, + alloc_flags, + explicit_address); + if (result != VK_SUCCESS) + return result; + } else { + assert(!new_bo.has_client_visible_address); + } + + if (new_bo._ccs_size > 0) { + assert(device->info->has_aux_map); + intel_aux_map_add_mapping(device->aux_map_ctx, + intel_canonical_address(new_bo.offset), + intel_canonical_address(new_bo.offset + new_bo.size), + new_bo.size, 0 /* format_bits */); + } + + assert(new_bo.gem_handle); + + /* If we just got this gem_handle from anv_bo_init_new then we know no one + * else is touching this BO at the moment so we don't need to lock here. 
+ */ + struct anv_bo *bo = anv_device_lookup_bo(device, new_bo.gem_handle); + *bo = new_bo; + + *bo_out = bo; + + return VK_SUCCESS; +} + +VkResult +anv_device_map_bo(struct anv_device *device, + struct anv_bo *bo, + uint64_t offset, + size_t size, + uint32_t gem_flags, + void **map_out) +{ + assert(!bo->is_wrapper && !bo->from_host_ptr); + assert(size > 0); + + void *map = anv_gem_mmap(device, bo->gem_handle, offset, size, gem_flags); + if (unlikely(map == MAP_FAILED)) + return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m"); + + assert(map != NULL); + + if (map_out) + *map_out = map; + + return VK_SUCCESS; +} + +void +anv_device_unmap_bo(struct anv_device *device, + struct anv_bo *bo, + void *map, size_t map_size) +{ + assert(!bo->is_wrapper && !bo->from_host_ptr); + + anv_gem_munmap(device, map, map_size); +} + +VkResult +anv_device_import_bo_from_host_ptr(struct anv_device *device, + void *host_ptr, uint32_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address, + struct anv_bo **bo_out) +{ + assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED | + ANV_BO_ALLOC_FIXED_ADDRESS))); + + assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS) || + (device->physical->has_implicit_ccs && device->info->has_aux_map)); + + struct anv_bo_cache *cache = &device->bo_cache; + const uint32_t bo_flags = + anv_bo_alloc_flags_to_bo_flags(device, alloc_flags); + assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS)); + + uint32_t gem_handle = anv_gem_userptr(device, host_ptr, size); + if (!gem_handle) + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + + pthread_mutex_lock(&cache->mutex); + + struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); + if (bo->refcount > 0) { + /* VK_EXT_external_memory_host doesn't require handling importing the + * same pointer twice at the same time, but we don't get in the way. If + * kernel gives us the same gem_handle, only succeed if the flags match. 
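
Both import paths (the host-pointer one here and the dma-buf one below) follow the same cache discipline: look the GEM handle up while holding the cache mutex, then either validate and take a reference on an existing entry or initialize a fresh one in place. A minimal sketch of that lookup-or-addref pattern with a plain array standing in for the sparse table (demo_ names are hypothetical):

#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <string.h>

struct demo_bo { uint32_t refcount; uint32_t flags; };

struct demo_cache {
   pthread_mutex_t mutex;
   struct demo_bo  bos[1024];   /* indexed directly by GEM handle here */
};

/* Returns the cached entry with one extra reference taken, or NULL on a flag
 * mismatch (the driver returns VK_ERROR_INVALID_EXTERNAL_HANDLE there).
 */
static struct demo_bo *
demo_cache_import(struct demo_cache *c, uint32_t handle, uint32_t flags)
{
   assert(handle < 1024);
   pthread_mutex_lock(&c->mutex);
   struct demo_bo *bo = &c->bos[handle];
   if (bo->refcount > 0) {
      if (bo->flags != flags) {          /* same handle imported two ways */
         pthread_mutex_unlock(&c->mutex);
         return NULL;
      }
      bo->refcount++;
   } else {
      memset(bo, 0, sizeof(*bo));
      bo->refcount = 1;
      bo->flags = flags;
   }
   pthread_mutex_unlock(&c->mutex);
   return bo;
}
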
+ */ + assert(bo->gem_handle == gem_handle); + if (bo_flags != bo->flags) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "same host pointer imported two different ways"); + } + + if (bo->has_client_visible_address != + ((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported with and without buffer " + "device address"); + } + + if (client_address && client_address != intel_48b_address(bo->offset)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported at two different " + "addresses"); + } + + __sync_fetch_and_add(&bo->refcount, 1); + } else { + struct anv_bo new_bo = { + .name = "host-ptr", + .gem_handle = gem_handle, + .refcount = 1, + .offset = -1, + .size = size, + .map = host_ptr, + .flags = bo_flags, + .is_external = true, + .from_host_ptr = true, + .has_client_visible_address = + (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0, + }; + + if (anv_bo_is_pinned(&new_bo)) { + VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo, + alloc_flags, + client_address); + if (result != VK_SUCCESS) { + pthread_mutex_unlock(&cache->mutex); + return result; + } + } else { + assert(!new_bo.has_client_visible_address); + } + + *bo = new_bo; + } + + pthread_mutex_unlock(&cache->mutex); + *bo_out = bo; + + return VK_SUCCESS; +} + +VkResult +anv_device_import_bo(struct anv_device *device, + int fd, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address, + struct anv_bo **bo_out) +{ + assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED | + ANV_BO_ALLOC_FIXED_ADDRESS))); + + assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS) || + (device->physical->has_implicit_ccs && device->info->has_aux_map)); + + struct anv_bo_cache *cache = &device->bo_cache; + const uint32_t bo_flags = + anv_bo_alloc_flags_to_bo_flags(device, alloc_flags); + assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS)); + + pthread_mutex_lock(&cache->mutex); + + uint32_t gem_handle = anv_gem_fd_to_handle(device, fd); + if (!gem_handle) { + pthread_mutex_unlock(&cache->mutex); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } + + struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); + if (bo->refcount > 0) { + /* We have to be careful how we combine flags so that it makes sense. + * Really, though, if we get to this case and it actually matters, the + * client has imported a BO twice in different ways and they get what + * they have coming. + */ + uint64_t new_flags = 0; + new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_WRITE; + new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_ASYNC; + new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_PINNED; + new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_CAPTURE; + + /* It's theoretically possible for a BO to get imported such that it's + * both pinned and not pinned. The only way this can happen is if it + * gets imported as both a semaphore and a memory object and that would + * be an application error. Just fail out in that case. 
+ */ + if ((bo->flags & EXEC_OBJECT_PINNED) != + (bo_flags & EXEC_OBJECT_PINNED)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported two different ways"); + } + + /* It's also theoretically possible that someone could export a BO from + * one heap and import it into another or to import the same BO into two + * different heaps. If this happens, we could potentially end up both + * allowing and disallowing 48-bit addresses. There's not much we can + * do about it if we're pinning so we just throw an error and hope no + * app is actually that stupid. + */ + if ((new_flags & EXEC_OBJECT_PINNED) && + (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) != + (bo_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported on two different heaps"); + } + + if (bo->has_client_visible_address != + ((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported with and without buffer " + "device address"); + } + + if (client_address && client_address != intel_48b_address(bo->offset)) { + pthread_mutex_unlock(&cache->mutex); + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "The same BO was imported at two different " + "addresses"); + } + + bo->flags = new_flags; + + __sync_fetch_and_add(&bo->refcount, 1); + } else { + off_t size = lseek(fd, 0, SEEK_END); + if (size == (off_t)-1) { + anv_gem_close(device, gem_handle); + pthread_mutex_unlock(&cache->mutex); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } + + struct anv_bo new_bo = { + .name = "imported", + .gem_handle = gem_handle, + .refcount = 1, + .offset = -1, + .size = size, + .flags = bo_flags, + .is_external = true, + .has_client_visible_address = + (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0, + }; + + if (anv_bo_is_pinned(&new_bo)) { + assert(new_bo._ccs_size == 0); + VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo, + alloc_flags, + client_address); + if (result != VK_SUCCESS) { + pthread_mutex_unlock(&cache->mutex); + return result; + } + } else { + assert(!new_bo.has_client_visible_address); + } + + *bo = new_bo; + } + + pthread_mutex_unlock(&cache->mutex); + *bo_out = bo; + + return VK_SUCCESS; +} + +VkResult +anv_device_export_bo(struct anv_device *device, + struct anv_bo *bo, int *fd_out) +{ + assert(anv_device_lookup_bo(device, bo->gem_handle) == bo); + + /* This BO must have been flagged external in order for us to be able + * to export it. This is done based on external options passed into + * anv_AllocateMemory. 
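
Sizing the imported dma-buf with lseek(fd, 0, SEEK_END), as the fd import above does, works because dma-buf file descriptors report their size through the seek interface. A tiny helper in the same spirit (hypothetical demo_ name):

#include <sys/types.h>
#include <unistd.h>

static off_t
demo_fd_size(int fd)
{
   /* lseek() returns (off_t)-1 and sets errno on failure, e.g. for a pipe. */
   return lseek(fd, 0, SEEK_END);
}
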
+ */ + assert(bo->is_external); + + int fd = anv_gem_handle_to_fd(device, bo->gem_handle); + if (fd < 0) + return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS); + + *fd_out = fd; + + return VK_SUCCESS; +} + +VkResult +anv_device_get_bo_tiling(struct anv_device *device, + struct anv_bo *bo, + enum isl_tiling *tiling_out) +{ + int i915_tiling = anv_gem_get_tiling(device, bo->gem_handle); + if (i915_tiling < 0) { + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "failed to get BO tiling: %m"); + } + + *tiling_out = isl_tiling_from_i915_tiling(i915_tiling); + + return VK_SUCCESS; +} + +VkResult +anv_device_set_bo_tiling(struct anv_device *device, + struct anv_bo *bo, + uint32_t row_pitch_B, + enum isl_tiling tiling) +{ + int ret = anv_gem_set_tiling(device, bo->gem_handle, row_pitch_B, + isl_tiling_to_i915_tiling(tiling)); + if (ret) { + return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "failed to set BO tiling: %m"); + } + + return VK_SUCCESS; +} + +static bool +atomic_dec_not_one(uint32_t *counter) +{ + uint32_t old, val; + + val = *counter; + while (1) { + if (val == 1) + return false; + + old = __sync_val_compare_and_swap(counter, val, val - 1); + if (old == val) + return true; + + val = old; + } +} + +void +anv_device_release_bo(struct anv_device *device, + struct anv_bo *bo) +{ + struct anv_bo_cache *cache = &device->bo_cache; + assert(anv_device_lookup_bo(device, bo->gem_handle) == bo); + + /* Try to decrement the counter but don't go below one. If this succeeds + * then the refcount has been decremented and we are not the last + * reference. + */ + if (atomic_dec_not_one(&bo->refcount)) + return; + + pthread_mutex_lock(&cache->mutex); + + /* We are probably the last reference since our attempt to decrement above + * failed. However, we can't actually know until we are inside the mutex. + * Otherwise, someone could import the BO between the decrement and our + * taking the mutex. + */ + if (unlikely(__sync_sub_and_fetch(&bo->refcount, 1) > 0)) { + /* Turns out we're not the last reference. Unlock and bail. */ + pthread_mutex_unlock(&cache->mutex); + return; + } + assert(bo->refcount == 0); + + if (bo->_ccs_size > 0) { + assert(device->physical->has_implicit_ccs); + assert(device->info->has_aux_map); + assert(bo->has_implicit_ccs); + intel_aux_map_unmap_range(device->aux_map_ctx, + intel_canonical_address(bo->offset), + bo->size); + } + + /* Memset the BO just in case. The refcount being zero should be enough to + * prevent someone from assuming the data is valid but it's safer to just + * stomp to zero just in case. We explicitly do this *before* we actually + * close the GEM handle to ensure that if anyone allocates something and + * gets the same GEM handle, the memset has already happen and won't stomp + * all over any data they may write in this BO. + */ + struct anv_bo old_bo = *bo; + + memset(bo, 0, sizeof(*bo)); + + anv_bo_finish(device, &old_bo); + + /* Don't unlock until we've actually closed the BO. The whole point of + * the BO cache is to ensure that we correctly handle races with creating + * and releasing GEM handles and we don't want to let someone import the BO + * again between mutex unlock and closing the GEM handle. + */ + pthread_mutex_unlock(&cache->mutex); +} diff --git a/src/intel/vulkan_hasvk/anv_android.c b/src/intel/vulkan_hasvk/anv_android.c new file mode 100644 index 00000000000..8a17f0a2454 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_android.c @@ -0,0 +1,792 @@ +/* + * Copyright © 2017, Google Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#if ANDROID_API_LEVEL >= 26 +#include +#endif + +#include +#include +#include +#include +#include + +#include "anv_private.h" +#include "vk_common_entrypoints.h" +#include "vk_util.h" + +static int anv_hal_open(const struct hw_module_t* mod, const char* id, struct hw_device_t** dev); +static int anv_hal_close(struct hw_device_t *dev); + +static void UNUSED +static_asserts(void) +{ + STATIC_ASSERT(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC); +} + +PUBLIC struct hwvulkan_module_t HAL_MODULE_INFO_SYM = { + .common = { + .tag = HARDWARE_MODULE_TAG, + .module_api_version = HWVULKAN_MODULE_API_VERSION_0_1, + .hal_api_version = HARDWARE_MAKE_API_VERSION(1, 0), + .id = HWVULKAN_HARDWARE_MODULE_ID, + .name = "Intel Vulkan HAL", + .author = "Intel", + .methods = &(hw_module_methods_t) { + .open = anv_hal_open, + }, + }, +}; + +/* If any bits in test_mask are set, then unset them and return true. */ +static inline bool +unmask32(uint32_t *inout_mask, uint32_t test_mask) +{ + uint32_t orig_mask = *inout_mask; + *inout_mask &= ~test_mask; + return *inout_mask != orig_mask; +} + +static int +anv_hal_open(const struct hw_module_t* mod, const char* id, + struct hw_device_t** dev) +{ + assert(mod == &HAL_MODULE_INFO_SYM.common); + assert(strcmp(id, HWVULKAN_DEVICE_0) == 0); + + hwvulkan_device_t *hal_dev = malloc(sizeof(*hal_dev)); + if (!hal_dev) + return -1; + + *hal_dev = (hwvulkan_device_t) { + .common = { + .tag = HARDWARE_DEVICE_TAG, + .version = HWVULKAN_DEVICE_API_VERSION_0_1, + .module = &HAL_MODULE_INFO_SYM.common, + .close = anv_hal_close, + }, + .EnumerateInstanceExtensionProperties = anv_EnumerateInstanceExtensionProperties, + .CreateInstance = anv_CreateInstance, + .GetInstanceProcAddr = anv_GetInstanceProcAddr, + }; + + *dev = &hal_dev->common; + return 0; +} + +static int +anv_hal_close(struct hw_device_t *dev) +{ + /* hwvulkan.h claims that hw_device_t::close() is never called. */ + return -1; +} + +#if ANDROID_API_LEVEL >= 26 +#include +/* See i915_private_android_types.h in minigbm. 
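
The unmask32() helper above is what setup_gralloc0_usage() later uses to consume Vulkan usage bits group by group and then reject anything left over. A short illustration of that consume-and-check pattern; the bit values here are invented placeholders, not real Vulkan or gralloc flags:

/* Classify a usage mask with the unmask32() helper defined above: each call
 * strips one group of bits, and any leftover bits mean the usage is
 * unsupported, signalled here by returning -1.
 */
static int
demo_classify_usage(uint32_t usage)
{
   int gralloc = 0;
   if (unmask32(&usage, 0x3))   /* pretend: TRANSFER_DST | COLOR_ATTACHMENT */
      gralloc |= 0x1;           /* pretend: GRALLOC_USAGE_HW_RENDER */
   if (unmask32(&usage, 0xc))   /* pretend: TRANSFER_SRC | SAMPLED */
      gralloc |= 0x2;           /* pretend: GRALLOC_USAGE_HW_TEXTURE */
   return usage == 0 ? gralloc : -1;
}
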
*/ +#define HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL 0x100 + +enum { + /* Usage bit equal to GRALLOC_USAGE_HW_CAMERA_MASK */ + BUFFER_USAGE_CAMERA_MASK = 0x00060000U, +}; + +inline VkFormat +vk_format_from_android(unsigned android_format, unsigned android_usage) +{ + switch (android_format) { + case AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM: + return VK_FORMAT_R8G8B8A8_UNORM; + case AHARDWAREBUFFER_FORMAT_R8G8B8X8_UNORM: + case AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM: + return VK_FORMAT_R8G8B8_UNORM; + case AHARDWAREBUFFER_FORMAT_R5G6B5_UNORM: + return VK_FORMAT_R5G6B5_UNORM_PACK16; + case AHARDWAREBUFFER_FORMAT_R16G16B16A16_FLOAT: + return VK_FORMAT_R16G16B16A16_SFLOAT; + case AHARDWAREBUFFER_FORMAT_R10G10B10A2_UNORM: + return VK_FORMAT_A2B10G10R10_UNORM_PACK32; + case AHARDWAREBUFFER_FORMAT_Y8Cb8Cr8_420: + case HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL: + return VK_FORMAT_G8_B8R8_2PLANE_420_UNORM; + case AHARDWAREBUFFER_FORMAT_IMPLEMENTATION_DEFINED: + if (android_usage & BUFFER_USAGE_CAMERA_MASK) + return VK_FORMAT_G8_B8R8_2PLANE_420_UNORM; + else + return VK_FORMAT_R8G8B8_UNORM; + case AHARDWAREBUFFER_FORMAT_BLOB: + default: + return VK_FORMAT_UNDEFINED; + } +} + +static inline unsigned +android_format_from_vk(unsigned vk_format) +{ + switch (vk_format) { + case VK_FORMAT_R8G8B8A8_UNORM: + return AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM; + case VK_FORMAT_R8G8B8_UNORM: + return AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM; + case VK_FORMAT_R5G6B5_UNORM_PACK16: + return AHARDWAREBUFFER_FORMAT_R5G6B5_UNORM; + case VK_FORMAT_R16G16B16A16_SFLOAT: + return AHARDWAREBUFFER_FORMAT_R16G16B16A16_FLOAT; + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + return AHARDWAREBUFFER_FORMAT_R10G10B10A2_UNORM; + case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: +#ifdef HAVE_CROS_GRALLOC + return AHARDWAREBUFFER_FORMAT_Y8Cb8Cr8_420; +#else + return HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL; +#endif + default: + return AHARDWAREBUFFER_FORMAT_BLOB; + } +} + +static VkFormatFeatureFlags +features2_to_features(VkFormatFeatureFlags2 features2) +{ + return features2 & VK_ALL_FORMAT_FEATURE_FLAG_BITS; +} + +static VkResult +get_ahw_buffer_format_properties2( + VkDevice device_h, + const struct AHardwareBuffer *buffer, + VkAndroidHardwareBufferFormatProperties2ANDROID *pProperties) +{ + ANV_FROM_HANDLE(anv_device, device, device_h); + + /* Get a description of buffer contents . */ + AHardwareBuffer_Desc desc; + AHardwareBuffer_describe(buffer, &desc); + + /* Verify description. */ + uint64_t gpu_usage = + AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE | + AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT | + AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER; + + /* "Buffer must be a valid Android hardware buffer object with at least + * one of the AHARDWAREBUFFER_USAGE_GPU_* usage flags." + */ + if (!(desc.usage & (gpu_usage))) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + /* Fill properties fields based on description. */ + VkAndroidHardwareBufferFormatProperties2ANDROID *p = pProperties; + + p->format = vk_format_from_android(desc.format, desc.usage); + + const struct anv_format *anv_format = anv_get_format(p->format); + p->externalFormat = (uint64_t) (uintptr_t) anv_format; + + /* Default to OPTIMAL tiling but set to linear in case + * of AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER usage. 
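
The two format switches above are intended to be inverses for the formats that map one to one, so a quick sanity check is to round-trip a format through both. A test-style sketch (not driver code) using the functions defined above:

#include <assert.h>

static void
demo_check_format_round_trip(void)
{
   static const VkFormat formats[] = {
      VK_FORMAT_R8G8B8A8_UNORM,
      VK_FORMAT_R5G6B5_UNORM_PACK16,
      VK_FORMAT_R16G16B16A16_SFLOAT,
      VK_FORMAT_A2B10G10R10_UNORM_PACK32,
   };
   for (unsigned i = 0; i < sizeof(formats) / sizeof(formats[0]); i++) {
      /* android_usage = 0: skip the camera special case */
      assert(vk_format_from_android(android_format_from_vk(formats[i]), 0) ==
             formats[i]);
   }
}
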
+ */ + VkImageTiling tiling = VK_IMAGE_TILING_OPTIMAL; + + if (desc.usage & AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER) + tiling = VK_IMAGE_TILING_LINEAR; + + p->formatFeatures = + anv_get_image_format_features2(device->info, p->format, anv_format, + tiling, NULL); + + /* "Images can be created with an external format even if the Android hardware + * buffer has a format which has an equivalent Vulkan format to enable + * consistent handling of images from sources that might use either category + * of format. However, all images created with an external format are subject + * to the valid usage requirements associated with external formats, even if + * the Android hardware buffer’s format has a Vulkan equivalent." + * + * "The formatFeatures member *must* include + * VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT and at least one of + * VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT or + * VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT" + */ + p->formatFeatures |= + VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT; + + /* "Implementations may not always be able to determine the color model, + * numerical range, or chroma offsets of the image contents, so the values + * in VkAndroidHardwareBufferFormatPropertiesANDROID are only suggestions. + * Applications should treat these values as sensible defaults to use in + * the absence of more reliable information obtained through some other + * means." + */ + p->samplerYcbcrConversionComponents.r = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.g = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.b = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.a = VK_COMPONENT_SWIZZLE_IDENTITY; + + p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601; + p->suggestedYcbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_FULL; + + p->suggestedXChromaOffset = VK_CHROMA_LOCATION_MIDPOINT; + p->suggestedYChromaOffset = VK_CHROMA_LOCATION_MIDPOINT; + + return VK_SUCCESS; +} + +VkResult +anv_GetAndroidHardwareBufferPropertiesANDROID( + VkDevice device_h, + const struct AHardwareBuffer *buffer, + VkAndroidHardwareBufferPropertiesANDROID *pProperties) +{ + ANV_FROM_HANDLE(anv_device, dev, device_h); + + VkAndroidHardwareBufferFormatPropertiesANDROID *format_prop = + vk_find_struct(pProperties->pNext, + ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID); + /* Fill format properties of an Android hardware buffer. 
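
vk_find_struct() used here is the stock helper for fishing an extension struct out of a Vulkan pNext chain; underneath it is just a typed linked-list walk keyed on sType. A generic sketch of that walk (hypothetical demo_ name):

#include <vulkan/vulkan.h>

/* Walk an output pNext chain and return the first structure whose sType
 * matches, or NULL.  This is essentially what the vk_find_struct() helper
 * does.
 */
static void *
demo_find_struct(void *chain, VkStructureType stype)
{
   for (VkBaseOutStructure *s = chain; s != NULL; s = s->pNext) {
      if (s->sType == stype)
         return s;
   }
   return NULL;
}
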
*/ + if (format_prop) { + VkAndroidHardwareBufferFormatProperties2ANDROID format_prop2 = { + .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID, + }; + get_ahw_buffer_format_properties2(device_h, buffer, &format_prop2); + + format_prop->format = format_prop2.format; + format_prop->externalFormat = format_prop2.externalFormat; + format_prop->formatFeatures = + features2_to_features(format_prop2.formatFeatures); + format_prop->samplerYcbcrConversionComponents = + format_prop2.samplerYcbcrConversionComponents; + format_prop->suggestedYcbcrModel = format_prop2.suggestedYcbcrModel; + format_prop->suggestedYcbcrRange = format_prop2.suggestedYcbcrRange; + format_prop->suggestedXChromaOffset = format_prop2.suggestedXChromaOffset; + format_prop->suggestedYChromaOffset = format_prop2.suggestedYChromaOffset; + } + + VkAndroidHardwareBufferFormatProperties2ANDROID *format_prop2 = + vk_find_struct(pProperties->pNext, + ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID); + if (format_prop2) + get_ahw_buffer_format_properties2(device_h, buffer, format_prop2); + + /* NOTE - We support buffers with only one handle but do not error on + * multiple handle case. Reason is that we want to support YUV formats + * where we have many logical planes but they all point to the same + * buffer, like is the case with VK_FORMAT_G8_B8R8_2PLANE_420_UNORM. + */ + const native_handle_t *handle = + AHardwareBuffer_getNativeHandle(buffer); + int dma_buf = (handle && handle->numFds) ? handle->data[0] : -1; + if (dma_buf < 0) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + /* All memory types. */ + uint32_t memory_types = (1ull << dev->physical->memory.type_count) - 1; + + pProperties->allocationSize = lseek(dma_buf, 0, SEEK_END); + pProperties->memoryTypeBits = memory_types; + + return VK_SUCCESS; +} + +VkResult +anv_GetMemoryAndroidHardwareBufferANDROID( + VkDevice device_h, + const VkMemoryGetAndroidHardwareBufferInfoANDROID *pInfo, + struct AHardwareBuffer **pBuffer) +{ + ANV_FROM_HANDLE(anv_device_memory, mem, pInfo->memory); + + /* Some quotes from Vulkan spec: + * + * "If the device memory was created by importing an Android hardware + * buffer, vkGetMemoryAndroidHardwareBufferANDROID must return that same + * Android hardware buffer object." + * + * "VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID must + * have been included in VkExportMemoryAllocateInfo::handleTypes when + * memory was created." + */ + if (mem->ahw) { + *pBuffer = mem->ahw; + /* Increase refcount. */ + AHardwareBuffer_acquire(mem->ahw); + return VK_SUCCESS; + } + + return VK_ERROR_OUT_OF_HOST_MEMORY; +} + +#endif + +/* Construct ahw usage mask from image usage bits, see + * 'AHardwareBuffer Usage Equivalence' in Vulkan spec. + */ +uint64_t +anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create, + const VkImageUsageFlags vk_usage) +{ + uint64_t ahw_usage = 0; +#if ANDROID_API_LEVEL >= 26 + if (vk_usage & VK_IMAGE_USAGE_SAMPLED_BIT) + ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE; + + if (vk_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) + ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE; + + if (vk_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) + ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT; + + if (vk_create & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) + ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_CUBE_MAP; + + if (vk_create & VK_IMAGE_CREATE_PROTECTED_BIT) + ahw_usage |= AHARDWAREBUFFER_USAGE_PROTECTED_CONTENT; + + /* No usage bits set - set at least one GPU usage. 
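
The memoryTypeBits value above is built with the usual low-bits mask trick: do the shift in a 64-bit intermediate so that a count of 32 stays well defined, then truncate. As a tiny sketch (hypothetical demo_ name):

#include <stdint.h>

/* Bitmask with the low `count` bits set; e.g. count == 3 gives 0x7.
 * Valid for count from 0 to 32 because the shift is done in 64 bits.
 */
static uint32_t
demo_low_bits_mask(unsigned count)
{
   return (uint32_t)((1ull << count) - 1);
}
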
*/ + if (ahw_usage == 0) + ahw_usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE; +#endif + return ahw_usage; +} + +/* + * Called from anv_AllocateMemory when import AHardwareBuffer. + */ +VkResult +anv_import_ahw_memory(VkDevice device_h, + struct anv_device_memory *mem, + const VkImportAndroidHardwareBufferInfoANDROID *info) +{ +#if ANDROID_API_LEVEL >= 26 + ANV_FROM_HANDLE(anv_device, device, device_h); + + /* Import from AHardwareBuffer to anv_device_memory. */ + const native_handle_t *handle = + AHardwareBuffer_getNativeHandle(info->buffer); + + /* NOTE - We support buffers with only one handle but do not error on + * multiple handle case. Reason is that we want to support YUV formats + * where we have many logical planes but they all point to the same + * buffer, like is the case with VK_FORMAT_G8_B8R8_2PLANE_420_UNORM. + */ + int dma_buf = (handle && handle->numFds) ? handle->data[0] : -1; + if (dma_buf < 0) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + VkResult result = anv_device_import_bo(device, dma_buf, 0, + 0 /* client_address */, + &mem->bo); + assert(result == VK_SUCCESS); + + /* "If the vkAllocateMemory command succeeds, the implementation must + * acquire a reference to the imported hardware buffer, which it must + * release when the device memory object is freed. If the command fails, + * the implementation must not retain a reference." + */ + AHardwareBuffer_acquire(info->buffer); + mem->ahw = info->buffer; + + return VK_SUCCESS; +#else + return VK_ERROR_EXTENSION_NOT_PRESENT; +#endif +} + +VkResult +anv_create_ahw_memory(VkDevice device_h, + struct anv_device_memory *mem, + const VkMemoryAllocateInfo *pAllocateInfo) +{ +#if ANDROID_API_LEVEL >= 26 + const VkMemoryDedicatedAllocateInfo *dedicated_info = + vk_find_struct_const(pAllocateInfo->pNext, + MEMORY_DEDICATED_ALLOCATE_INFO); + + uint32_t w = 0; + uint32_t h = 1; + uint32_t layers = 1; + uint32_t format = 0; + uint64_t usage = 0; + + /* If caller passed dedicated information. */ + if (dedicated_info && dedicated_info->image) { + ANV_FROM_HANDLE(anv_image, image, dedicated_info->image); + w = image->vk.extent.width; + h = image->vk.extent.height; + layers = image->vk.array_layers; + format = android_format_from_vk(image->vk.format); + usage = anv_ahw_usage_from_vk_usage(image->vk.create_flags, image->vk.usage); + } else if (dedicated_info && dedicated_info->buffer) { + ANV_FROM_HANDLE(anv_buffer, buffer, dedicated_info->buffer); + w = buffer->vk.size; + format = AHARDWAREBUFFER_FORMAT_BLOB; + usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | + AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN; + } else { + w = pAllocateInfo->allocationSize; + format = AHARDWAREBUFFER_FORMAT_BLOB; + usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | + AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN; + } + + struct AHardwareBuffer *ahw = NULL; + struct AHardwareBuffer_Desc desc = { + .width = w, + .height = h, + .layers = layers, + .format = format, + .usage = usage, + }; + + if (AHardwareBuffer_allocate(&desc, &ahw) != 0) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + const VkImportAndroidHardwareBufferInfoANDROID import_info = { + .buffer = ahw, + }; + VkResult result = anv_import_ahw_memory(device_h, mem, &import_info); + + /* Release a reference to avoid leak for AHB allocation. 
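
The reference handling in anv_create_ahw_memory() relies on the NDK convention that AHardwareBuffer_allocate() hands back one reference; the import path then takes its own with AHardwareBuffer_acquire(), and the allocation reference is dropped so only the device memory keeps the buffer alive. A condensed sketch of that sequence (a sketch only, error handling elided):

#include <android/hardware_buffer.h>

/* Allocate an AHardwareBuffer and hand it to a consumer that keeps exactly
 * one reference; the acquire below mirrors what the import path does.
 */
static AHardwareBuffer *
demo_alloc_and_adopt(const AHardwareBuffer_Desc *desc)
{
   AHardwareBuffer *ahb = NULL;
   if (AHardwareBuffer_allocate(desc, &ahb) != 0)
      return NULL;

   AHardwareBuffer_acquire(ahb);  /* the consumer's (import side's) reference */
   AHardwareBuffer_release(ahb);  /* drop the reference allocate() gave us */
   return ahb;                    /* refcount is 1 here */
}
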
*/ + AHardwareBuffer_release(ahw); + + return result; +#else + return VK_ERROR_EXTENSION_NOT_PRESENT; +#endif + +} + +VkResult +anv_image_init_from_gralloc(struct anv_device *device, + struct anv_image *image, + const VkImageCreateInfo *base_info, + const VkNativeBufferANDROID *gralloc_info) +{ + struct anv_bo *bo = NULL; + VkResult result; + + struct anv_image_create_info anv_info = { + .vk_info = base_info, + .isl_extra_usage_flags = ISL_SURF_USAGE_DISABLE_AUX_BIT, + }; + + if (gralloc_info->handle->numFds != 1) { + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "VkNativeBufferANDROID::handle::numFds is %d, " + "expected 1", gralloc_info->handle->numFds); + } + + /* Do not close the gralloc handle's dma_buf. The lifetime of the dma_buf + * must exceed that of the gralloc handle, and we do not own the gralloc + * handle. + */ + int dma_buf = gralloc_info->handle->data[0]; + + /* We need to set the WRITE flag on window system buffers so that GEM will + * know we're writing to them and synchronize uses on other rings (for + * example, if the display server uses the blitter ring). + * + * If this function fails and if the imported bo was resident in the cache, + * we should avoid updating the bo's flags. Therefore, we defer updating + * the flags until success is certain. + * + */ + result = anv_device_import_bo(device, dma_buf, + ANV_BO_ALLOC_IMPLICIT_SYNC | + ANV_BO_ALLOC_IMPLICIT_WRITE, + 0 /* client_address */, + &bo); + if (result != VK_SUCCESS) { + return vk_errorf(device, result, + "failed to import dma-buf from VkNativeBufferANDROID"); + } + + enum isl_tiling tiling; + result = anv_device_get_bo_tiling(device, bo, &tiling); + if (result != VK_SUCCESS) { + return vk_errorf(device, result, + "failed to get tiling from VkNativeBufferANDROID"); + } + anv_info.isl_tiling_flags = 1u << tiling; + + enum isl_format format = anv_get_isl_format(device->info, + base_info->format, + VK_IMAGE_ASPECT_COLOR_BIT, + base_info->tiling); + assert(format != ISL_FORMAT_UNSUPPORTED); + + result = anv_image_init(device, image, &anv_info); + if (result != VK_SUCCESS) + goto fail_init; + + VkMemoryRequirements2 mem_reqs = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, + }; + + anv_image_get_memory_requirements(device, image, image->vk.aspects, + &mem_reqs); + + VkDeviceSize aligned_image_size = + align_u64(mem_reqs.memoryRequirements.size, + mem_reqs.memoryRequirements.alignment); + + if (bo->size < aligned_image_size) { + result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "dma-buf from VkNativeBufferANDROID is too small for " + "VkImage: %"PRIu64"B < %"PRIu64"B", + bo->size, aligned_image_size); + goto fail_size; + } + + assert(!image->disjoint); + assert(image->n_planes == 1); + assert(image->planes[0].primary_surface.memory_range.binding == + ANV_IMAGE_MEMORY_BINDING_MAIN); + assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo == NULL); + assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.offset == 0); + image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo = bo; + image->from_gralloc = true; + + return VK_SUCCESS; + + fail_size: + anv_image_finish(image); + fail_init: + anv_device_release_bo(device, bo); + + return result; +} + +VkResult +anv_image_bind_from_gralloc(struct anv_device *device, + struct anv_image *image, + const VkNativeBufferANDROID *gralloc_info) +{ + /* Do not close the gralloc handle's dma_buf. The lifetime of the dma_buf + * must exceed that of the gralloc handle, and we do not own the gralloc + * handle. 
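
The size validation in anv_image_init_from_gralloc() above reduces to rounding the image's required size up to its required alignment and comparing against the imported BO. A tiny sketch of that check (hypothetical demo_ name; Vulkan guarantees the alignment is a power of two):

#include <stdbool.h>
#include <stdint.h>

/* True if a buffer of bo_size bytes can back an image whose memory
 * requirements are (req_size, req_alignment).
 */
static bool
demo_bo_fits_image(uint64_t bo_size, uint64_t req_size, uint64_t req_alignment)
{
   uint64_t aligned = (req_size + req_alignment - 1) & ~(req_alignment - 1);
   return bo_size >= aligned;
}
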
+ */ + int dma_buf = gralloc_info->handle->data[0]; + + /* We need to set the WRITE flag on window system buffers so that GEM will + * know we're writing to them and synchronize uses on other rings (for + * example, if the display server uses the blitter ring). + * + * If this function fails and if the imported bo was resident in the cache, + * we should avoid updating the bo's flags. Therefore, we defer updating + * the flags until success is certain. + * + */ + struct anv_bo *bo = NULL; + VkResult result = anv_device_import_bo(device, dma_buf, + ANV_BO_ALLOC_IMPLICIT_SYNC | + ANV_BO_ALLOC_IMPLICIT_WRITE, + 0 /* client_address */, + &bo); + if (result != VK_SUCCESS) { + return vk_errorf(device, result, + "failed to import dma-buf from VkNativeBufferANDROID"); + } + + uint64_t img_size = image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].memory_range.size; + if (img_size < bo->size) { + result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "dma-buf from VkNativeBufferANDROID is too small for " + "VkImage: %"PRIu64"B < %"PRIu64"B", + bo->size, img_size); + anv_device_release_bo(device, bo); + return result; + } + + assert(!image->disjoint); + assert(image->n_planes == 1); + assert(image->planes[0].primary_surface.memory_range.binding == + ANV_IMAGE_MEMORY_BINDING_MAIN); + assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo == NULL); + assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.offset == 0); + image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo = bo; + image->from_gralloc = true; + + return VK_SUCCESS; +} + +static VkResult +format_supported_with_usage(VkDevice device_h, VkFormat format, + VkImageUsageFlags imageUsage) +{ + ANV_FROM_HANDLE(anv_device, device, device_h); + VkPhysicalDevice phys_dev_h = anv_physical_device_to_handle(device->physical); + VkResult result; + + const VkPhysicalDeviceImageFormatInfo2 image_format_info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .format = format, + .type = VK_IMAGE_TYPE_2D, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = imageUsage, + }; + + VkImageFormatProperties2 image_format_props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + }; + + /* Check that requested format and usage are supported. */ + result = anv_GetPhysicalDeviceImageFormatProperties2(phys_dev_h, + &image_format_info, &image_format_props); + if (result != VK_SUCCESS) { + return vk_errorf(device, result, + "anv_GetPhysicalDeviceImageFormatProperties2 failed " + "inside %s", __func__); + } + return VK_SUCCESS; +} + + +static VkResult +setup_gralloc0_usage(struct anv_device *device, VkFormat format, + VkImageUsageFlags imageUsage, int *grallocUsage) +{ + /* WARNING: Android's libvulkan.so hardcodes the VkImageUsageFlags + * returned to applications via VkSurfaceCapabilitiesKHR::supportedUsageFlags. + * The relevant code in libvulkan/swapchain.cpp contains this fun comment: + * + * TODO(jessehall): I think these are right, but haven't thought hard + * about it. Do we need to query the driver for support of any of + * these? + * + * Any disagreement between this function and the hardcoded + * VkSurfaceCapabilitiesKHR:supportedUsageFlags causes tests + * dEQP-VK.wsi.android.swapchain.*.image_usage to fail. 
+ */ + + if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_DST_BIT | + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)) + *grallocUsage |= GRALLOC_USAGE_HW_RENDER; + + if (unmask32(&imageUsage, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_STORAGE_BIT | + VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) + *grallocUsage |= GRALLOC_USAGE_HW_TEXTURE; + + /* All VkImageUsageFlags not explicitly checked here are unsupported for + * gralloc swapchains. + */ + if (imageUsage != 0) { + return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED, + "unsupported VkImageUsageFlags(0x%x) for gralloc " + "swapchain", imageUsage); + } + + /* The below formats support GRALLOC_USAGE_HW_FB (that is, display + * scanout). This short list of formats is univserally supported on Intel + * but is incomplete. The full set of supported formats is dependent on + * kernel and hardware. + * + * FINISHME: Advertise all display-supported formats. + */ + switch (format) { + case VK_FORMAT_B8G8R8A8_UNORM: + case VK_FORMAT_R5G6B5_UNORM_PACK16: + case VK_FORMAT_R8G8B8A8_UNORM: + case VK_FORMAT_R8G8B8A8_SRGB: + *grallocUsage |= GRALLOC_USAGE_HW_FB | + GRALLOC_USAGE_HW_COMPOSER | + GRALLOC_USAGE_EXTERNAL_DISP; + break; + default: + mesa_logw("%s: unsupported format=%d", __func__, format); + } + + if (*grallocUsage == 0) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + + return VK_SUCCESS; +} + +#if ANDROID_API_LEVEL >= 26 +VkResult anv_GetSwapchainGrallocUsage2ANDROID( + VkDevice device_h, + VkFormat format, + VkImageUsageFlags imageUsage, + VkSwapchainImageUsageFlagsANDROID swapchainImageUsage, + uint64_t* grallocConsumerUsage, + uint64_t* grallocProducerUsage) +{ + ANV_FROM_HANDLE(anv_device, device, device_h); + VkResult result; + + *grallocConsumerUsage = 0; + *grallocProducerUsage = 0; + mesa_logd("%s: format=%d, usage=0x%x", __func__, format, imageUsage); + + result = format_supported_with_usage(device_h, format, imageUsage); + if (result != VK_SUCCESS) + return result; + + int32_t grallocUsage = 0; + result = setup_gralloc0_usage(device, format, imageUsage, &grallocUsage); + if (result != VK_SUCCESS) + return result; + + /* Setup gralloc1 usage flags from gralloc0 flags. 
*/ + + if (grallocUsage & GRALLOC_USAGE_HW_RENDER) { + *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET; + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_CLIENT_TARGET; + } + + if (grallocUsage & GRALLOC_USAGE_HW_TEXTURE) { + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_GPU_TEXTURE; + } + + if (grallocUsage & (GRALLOC_USAGE_HW_FB | + GRALLOC_USAGE_HW_COMPOSER | + GRALLOC_USAGE_EXTERNAL_DISP)) { + *grallocProducerUsage |= GRALLOC1_PRODUCER_USAGE_GPU_RENDER_TARGET; + *grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_HWCOMPOSER; + } + + return VK_SUCCESS; +} +#endif + +VkResult anv_GetSwapchainGrallocUsageANDROID( + VkDevice device_h, + VkFormat format, + VkImageUsageFlags imageUsage, + int* grallocUsage) +{ + ANV_FROM_HANDLE(anv_device, device, device_h); + VkResult result; + + *grallocUsage = 0; + mesa_logd("%s: format=%d, usage=0x%x", __func__, format, imageUsage); + + result = format_supported_with_usage(device_h, format, imageUsage); + if (result != VK_SUCCESS) + return result; + + return setup_gralloc0_usage(device, format, imageUsage, grallocUsage); +} diff --git a/src/intel/vulkan_hasvk/anv_android.h b/src/intel/vulkan_hasvk/anv_android.h new file mode 100644 index 00000000000..4490d3b2437 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_android.h @@ -0,0 +1,57 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef ANV_ANDROID_H +#define ANV_ANDROID_H + +#if defined(ANDROID) && ANDROID_API_LEVEL >= 26 +#include +#endif +#include +#include +#include + +struct anv_device_memory; +struct anv_device; +struct anv_image; + +VkResult anv_image_init_from_gralloc(struct anv_device *device, + struct anv_image *image, + const VkImageCreateInfo *base_info, + const VkNativeBufferANDROID *gralloc_info); + +VkResult anv_image_bind_from_gralloc(struct anv_device *device, + struct anv_image *image, + const VkNativeBufferANDROID *gralloc_info); + +uint64_t anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create, + const VkImageUsageFlags vk_usage); + +VkResult anv_import_ahw_memory(VkDevice device_h, + struct anv_device_memory *mem, + const VkImportAndroidHardwareBufferInfoANDROID *info); + +VkResult anv_create_ahw_memory(VkDevice device_h, + struct anv_device_memory *mem, + const VkMemoryAllocateInfo *pAllocateInfo); +#endif /* ANV_ANDROID_H */ diff --git a/src/intel/vulkan_hasvk/anv_android_stubs.c b/src/intel/vulkan_hasvk/anv_android_stubs.c new file mode 100644 index 00000000000..d5bc11949ab --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_android_stubs.c @@ -0,0 +1,63 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "anv_android.h" + +VkResult +anv_image_init_from_gralloc(struct anv_device *device, + struct anv_image *image, + const VkImageCreateInfo *base_info, + const VkNativeBufferANDROID *gralloc_info) +{ + return VK_ERROR_EXTENSION_NOT_PRESENT; +} + +VkResult anv_image_bind_from_gralloc(struct anv_device *device, + struct anv_image *image, + const VkNativeBufferANDROID *gralloc_info) +{ + return VK_ERROR_EXTENSION_NOT_PRESENT; +} + +uint64_t +anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create, + const VkImageUsageFlags vk_usage) +{ + return 0; +} + +VkResult +anv_import_ahw_memory(VkDevice device_h, + struct anv_device_memory *mem, + const VkImportAndroidHardwareBufferInfoANDROID *info) +{ + return VK_ERROR_EXTENSION_NOT_PRESENT; +} + +VkResult +anv_create_ahw_memory(VkDevice device_h, + struct anv_device_memory *mem, + const VkMemoryAllocateInfo *pAllocateInfo) +{ + return VK_ERROR_EXTENSION_NOT_PRESENT; +} diff --git a/src/intel/vulkan_hasvk/anv_batch_chain.c b/src/intel/vulkan_hasvk/anv_batch_chain.c new file mode 100644 index 00000000000..459747e0a29 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_batch_chain.c @@ -0,0 +1,2477 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include + +#include "anv_private.h" +#include "anv_measure.h" + +#include "genxml/gen8_pack.h" +#include "genxml/genX_bits.h" +#include "perf/intel_perf.h" + +#include "util/debug.h" +#include "util/perf/u_trace.h" + +/** \file anv_batch_chain.c + * + * This file contains functions related to anv_cmd_buffer as a data + * structure. This involves everything required to create and destroy + * the actual batch buffers as well as link them together and handle + * relocations and surface state. It specifically does *not* contain any + * handling of actual vkCmd calls beyond vkCmdExecuteCommands. 
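
The relocation lists in this file grow geometrically: capacity starts at a small floor and doubles until the request fits, which keeps appends amortized constant time; anv_reloc_list_grow() below is one instance of the pattern. A generic sketch of that growth policy (hypothetical demo_ name):

#include <stdlib.h>

/* Grow an array of fixed-size elements to hold at least `needed` entries,
 * doubling from a floor of 16 much like the reloc list code below.
 * Returns the (possibly moved) array, or NULL on allocation failure.
 */
static void *
demo_array_grow(void *array, size_t elem_size, size_t *capacity, size_t needed)
{
   if (needed <= *capacity)
      return array;

   size_t new_cap = *capacity ? *capacity * 2 : 16;
   while (new_cap < needed)
      new_cap *= 2;

   void *new_array = realloc(array, new_cap * elem_size);
   if (new_array == NULL)
      return NULL;

   *capacity = new_cap;
   return new_array;
}
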
+ */ + +/*-----------------------------------------------------------------------* + * Functions related to anv_reloc_list + *-----------------------------------------------------------------------*/ + +VkResult +anv_reloc_list_init(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc) +{ + memset(list, 0, sizeof(*list)); + return VK_SUCCESS; +} + +static VkResult +anv_reloc_list_init_clone(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + const struct anv_reloc_list *other_list) +{ + list->num_relocs = other_list->num_relocs; + list->array_length = other_list->array_length; + + if (list->num_relocs > 0) { + list->relocs = + vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (list->relocs == NULL) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + + list->reloc_bos = + vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (list->reloc_bos == NULL) { + vk_free(alloc, list->relocs); + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + memcpy(list->relocs, other_list->relocs, + list->array_length * sizeof(*list->relocs)); + memcpy(list->reloc_bos, other_list->reloc_bos, + list->array_length * sizeof(*list->reloc_bos)); + } else { + list->relocs = NULL; + list->reloc_bos = NULL; + } + + list->dep_words = other_list->dep_words; + + if (list->dep_words > 0) { + list->deps = + vk_alloc(alloc, list->dep_words * sizeof(BITSET_WORD), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + memcpy(list->deps, other_list->deps, + list->dep_words * sizeof(BITSET_WORD)); + } else { + list->deps = NULL; + } + + return VK_SUCCESS; +} + +void +anv_reloc_list_finish(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc) +{ + vk_free(alloc, list->relocs); + vk_free(alloc, list->reloc_bos); + vk_free(alloc, list->deps); +} + +static VkResult +anv_reloc_list_grow(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + size_t num_additional_relocs) +{ + if (list->num_relocs + num_additional_relocs <= list->array_length) + return VK_SUCCESS; + + size_t new_length = MAX2(16, list->array_length * 2); + while (new_length < list->num_relocs + num_additional_relocs) + new_length *= 2; + + struct drm_i915_gem_relocation_entry *new_relocs = + vk_realloc(alloc, list->relocs, + new_length * sizeof(*list->relocs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_relocs == NULL) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + list->relocs = new_relocs; + + struct anv_bo **new_reloc_bos = + vk_realloc(alloc, list->reloc_bos, + new_length * sizeof(*list->reloc_bos), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_reloc_bos == NULL) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + list->reloc_bos = new_reloc_bos; + + list->array_length = new_length; + + return VK_SUCCESS; +} + +static VkResult +anv_reloc_list_grow_deps(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + uint32_t min_num_words) +{ + if (min_num_words <= list->dep_words) + return VK_SUCCESS; + + uint32_t new_length = MAX2(32, list->dep_words * 2); + while (new_length < min_num_words) + new_length *= 2; + + BITSET_WORD *new_deps = + vk_realloc(alloc, list->deps, new_length * sizeof(BITSET_WORD), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (new_deps == NULL) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + list->deps = new_deps; + + /* Zero out the new data */ + memset(list->deps + list->dep_words, 0, + (new_length - list->dep_words) * 
sizeof(BITSET_WORD)); + list->dep_words = new_length; + + return VK_SUCCESS; +} + +#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + +VkResult +anv_reloc_list_add_bo(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + struct anv_bo *target_bo) +{ + assert(!target_bo->is_wrapper); + assert(anv_bo_is_pinned(target_bo)); + + uint32_t idx = target_bo->gem_handle; + VkResult result = anv_reloc_list_grow_deps(list, alloc, + (idx / BITSET_WORDBITS) + 1); + if (unlikely(result != VK_SUCCESS)) + return result; + + BITSET_SET(list->deps, idx); + + return VK_SUCCESS; +} + +VkResult +anv_reloc_list_add(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + uint32_t offset, struct anv_bo *target_bo, uint32_t delta, + uint64_t *address_u64_out) +{ + struct drm_i915_gem_relocation_entry *entry; + int index; + + struct anv_bo *unwrapped_target_bo = anv_bo_unwrap(target_bo); + uint64_t target_bo_offset = READ_ONCE(unwrapped_target_bo->offset); + if (address_u64_out) + *address_u64_out = target_bo_offset + delta; + + assert(unwrapped_target_bo->gem_handle > 0); + assert(unwrapped_target_bo->refcount > 0); + + if (anv_bo_is_pinned(unwrapped_target_bo)) + return anv_reloc_list_add_bo(list, alloc, unwrapped_target_bo); + + VkResult result = anv_reloc_list_grow(list, alloc, 1); + if (result != VK_SUCCESS) + return result; + + /* XXX: Can we use I915_EXEC_HANDLE_LUT? */ + index = list->num_relocs++; + list->reloc_bos[index] = target_bo; + entry = &list->relocs[index]; + entry->target_handle = -1; /* See also anv_cmd_buffer_process_relocs() */ + entry->delta = delta; + entry->offset = offset; + entry->presumed_offset = target_bo_offset; + entry->read_domains = 0; + entry->write_domain = 0; + VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry))); + + return VK_SUCCESS; +} + +static void +anv_reloc_list_clear(struct anv_reloc_list *list) +{ + list->num_relocs = 0; + if (list->dep_words > 0) + memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD)); +} + +static VkResult +anv_reloc_list_append(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + struct anv_reloc_list *other, uint32_t offset) +{ + VkResult result = anv_reloc_list_grow(list, alloc, other->num_relocs); + if (result != VK_SUCCESS) + return result; + + if (other->num_relocs > 0) { + memcpy(&list->relocs[list->num_relocs], &other->relocs[0], + other->num_relocs * sizeof(other->relocs[0])); + memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0], + other->num_relocs * sizeof(other->reloc_bos[0])); + + for (uint32_t i = 0; i < other->num_relocs; i++) + list->relocs[i + list->num_relocs].offset += offset; + + list->num_relocs += other->num_relocs; + } + + anv_reloc_list_grow_deps(list, alloc, other->dep_words); + for (uint32_t w = 0; w < other->dep_words; w++) + list->deps[w] |= other->deps[w]; + + return VK_SUCCESS; +} + +/*-----------------------------------------------------------------------* + * Functions related to anv_batch + *-----------------------------------------------------------------------*/ + +void * +anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords) +{ + if (batch->next + num_dwords * 4 > batch->end) { + VkResult result = batch->extend_cb(batch, batch->user_data); + if (result != VK_SUCCESS) { + anv_batch_set_error(batch, result); + return NULL; + } + } + + void *p = batch->next; + + batch->next += num_dwords * 4; + assert(batch->next <= batch->end); + + return p; +} + +struct anv_address +anv_batch_address(struct anv_batch *batch, void *batch_location) +{ + 
assert(batch->start <= batch_location); + + /* Allow a jump at the current location of the batch. */ + assert(batch->next >= batch_location); + + return anv_address_add(batch->start_addr, batch_location - batch->start); +} + +void +anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other) +{ + uint32_t size, offset; + + size = other->next - other->start; + assert(size % 4 == 0); + + if (batch->next + size > batch->end) { + VkResult result = batch->extend_cb(batch, batch->user_data); + if (result != VK_SUCCESS) { + anv_batch_set_error(batch, result); + return; + } + } + + assert(batch->next + size <= batch->end); + + VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size)); + memcpy(batch->next, other->start, size); + + offset = batch->next - batch->start; + VkResult result = anv_reloc_list_append(batch->relocs, batch->alloc, + other->relocs, offset); + if (result != VK_SUCCESS) { + anv_batch_set_error(batch, result); + return; + } + + batch->next += size; +} + +/*-----------------------------------------------------------------------* + * Functions related to anv_batch_bo + *-----------------------------------------------------------------------*/ + +static VkResult +anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer, + uint32_t size, + struct anv_batch_bo **bbo_out) +{ + VkResult result; + + struct anv_batch_bo *bbo = vk_zalloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo), + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (bbo == NULL) + return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, + size, &bbo->bo); + if (result != VK_SUCCESS) + goto fail_alloc; + + result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->vk.pool->alloc); + if (result != VK_SUCCESS) + goto fail_bo_alloc; + + *bbo_out = bbo; + + return VK_SUCCESS; + + fail_bo_alloc: + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); + fail_alloc: + vk_free(&cmd_buffer->vk.pool->alloc, bbo); + + return result; +} + +static VkResult +anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer, + const struct anv_batch_bo *other_bbo, + struct anv_batch_bo **bbo_out) +{ + VkResult result; + + struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo), + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (bbo == NULL) + return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, + other_bbo->bo->size, &bbo->bo); + if (result != VK_SUCCESS) + goto fail_alloc; + + result = anv_reloc_list_init_clone(&bbo->relocs, &cmd_buffer->vk.pool->alloc, + &other_bbo->relocs); + if (result != VK_SUCCESS) + goto fail_bo_alloc; + + bbo->length = other_bbo->length; + memcpy(bbo->bo->map, other_bbo->bo->map, other_bbo->length); + *bbo_out = bbo; + + return VK_SUCCESS; + + fail_bo_alloc: + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); + fail_alloc: + vk_free(&cmd_buffer->vk.pool->alloc, bbo); + + return result; +} + +static void +anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch, + size_t batch_padding) +{ + anv_batch_set_storage(batch, (struct anv_address) { .bo = bbo->bo, }, + bbo->bo->map, bbo->bo->size - batch_padding); + batch->relocs = &bbo->relocs; + anv_reloc_list_clear(&bbo->relocs); +} + +static void +anv_batch_bo_continue(struct anv_batch_bo *bbo, struct anv_batch *batch, + size_t batch_padding) +{ + batch->start_addr = (struct anv_address) { .bo = bbo->bo, }; + batch->start = bbo->bo->map; + batch->next = bbo->bo->map + bbo->length; + batch->end = 
bbo->bo->map + bbo->bo->size - batch_padding; + batch->relocs = &bbo->relocs; +} + +static void +anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch) +{ + assert(batch->start == bbo->bo->map); + bbo->length = batch->next - batch->start; + VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length)); +} + +static VkResult +anv_batch_bo_grow(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo *bbo, + struct anv_batch *batch, size_t additional, + size_t batch_padding) +{ + assert(batch->start == bbo->bo->map); + bbo->length = batch->next - batch->start; + + size_t new_size = bbo->bo->size; + while (new_size <= bbo->length + additional + batch_padding) + new_size *= 2; + + if (new_size == bbo->bo->size) + return VK_SUCCESS; + + struct anv_bo *new_bo; + VkResult result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool, + new_size, &new_bo); + if (result != VK_SUCCESS) + return result; + + memcpy(new_bo->map, bbo->bo->map, bbo->length); + + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); + + bbo->bo = new_bo; + anv_batch_bo_continue(bbo, batch, batch_padding); + + return VK_SUCCESS; +} + +static void +anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer, + struct anv_batch_bo *prev_bbo, + struct anv_batch_bo *next_bbo, + uint32_t next_bbo_offset) +{ + const uint32_t bb_start_offset = + prev_bbo->length - GFX8_MI_BATCH_BUFFER_START_length * 4; + ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset; + + /* Make sure we're looking at a MI_BATCH_BUFFER_START */ + assert(((*bb_start >> 29) & 0x07) == 0); + assert(((*bb_start >> 23) & 0x3f) == 49); + + if (anv_use_relocations(cmd_buffer->device->physical)) { + uint32_t reloc_idx = prev_bbo->relocs.num_relocs - 1; + assert(prev_bbo->relocs.relocs[reloc_idx].offset == bb_start_offset + 4); + + prev_bbo->relocs.reloc_bos[reloc_idx] = next_bbo->bo; + prev_bbo->relocs.relocs[reloc_idx].delta = next_bbo_offset; + + /* Use a bogus presumed offset to force a relocation */ + prev_bbo->relocs.relocs[reloc_idx].presumed_offset = -1; + } else { + assert(anv_bo_is_pinned(prev_bbo->bo)); + assert(anv_bo_is_pinned(next_bbo->bo)); + + write_reloc(cmd_buffer->device, + prev_bbo->bo->map + bb_start_offset + 4, + next_bbo->bo->offset + next_bbo_offset, true); + } +} + +static void +anv_batch_bo_destroy(struct anv_batch_bo *bbo, + struct anv_cmd_buffer *cmd_buffer) +{ + anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->vk.pool->alloc); + anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo); + vk_free(&cmd_buffer->vk.pool->alloc, bbo); +} + +static VkResult +anv_batch_bo_list_clone(const struct list_head *list, + struct anv_cmd_buffer *cmd_buffer, + struct list_head *new_list) +{ + VkResult result = VK_SUCCESS; + + list_inithead(new_list); + + struct anv_batch_bo *prev_bbo = NULL; + list_for_each_entry(struct anv_batch_bo, bbo, list, link) { + struct anv_batch_bo *new_bbo = NULL; + result = anv_batch_bo_clone(cmd_buffer, bbo, &new_bbo); + if (result != VK_SUCCESS) + break; + list_addtail(&new_bbo->link, new_list); + + if (prev_bbo) + anv_batch_bo_link(cmd_buffer, prev_bbo, new_bbo, 0); + + prev_bbo = new_bbo; + } + + if (result != VK_SUCCESS) { + list_for_each_entry_safe(struct anv_batch_bo, bbo, new_list, link) { + list_del(&bbo->link); + anv_batch_bo_destroy(bbo, cmd_buffer); + } + } + + return result; +} + +/*-----------------------------------------------------------------------* + * Functions related to anv_batch_bo + *-----------------------------------------------------------------------*/ + +static 
struct anv_batch_bo * +anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer) +{ + return list_entry(cmd_buffer->batch_bos.prev, struct anv_batch_bo, link); +} + +struct anv_address +anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_state_pool *pool = anv_binding_table_pool(cmd_buffer->device); + struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states); + return (struct anv_address) { + .bo = pool->block_pool.bo, + .offset = bt_block->offset - pool->start_offset, + }; +} + +static void +emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer, + struct anv_bo *bo, uint32_t offset) +{ + /* In gfx8+ the address field grew to two dwords to accommodate 48 bit + * offsets. The high 16 bits are in the last dword, so we can use the gfx8 + * version in either case, as long as we set the instruction length in the + * header accordingly. This means that we always emit three dwords here + * and all the padding and adjustment we do in this file works for all + * gens. + */ + +#define GFX7_MI_BATCH_BUFFER_START_length 2 +#define GFX7_MI_BATCH_BUFFER_START_length_bias 2 + + const uint32_t gfx7_length = + GFX7_MI_BATCH_BUFFER_START_length - GFX7_MI_BATCH_BUFFER_START_length_bias; + const uint32_t gfx8_length = + GFX8_MI_BATCH_BUFFER_START_length - GFX8_MI_BATCH_BUFFER_START_length_bias; + + anv_batch_emit(&cmd_buffer->batch, GFX8_MI_BATCH_BUFFER_START, bbs) { + bbs.DWordLength = cmd_buffer->device->info->ver < 8 ? + gfx7_length : gfx8_length; + bbs.SecondLevelBatchBuffer = Firstlevelbatch; + bbs.AddressSpaceIndicator = ASI_PPGTT; + bbs.BatchBufferStartAddress = (struct anv_address) { bo, offset }; + } +} + +static void +cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer, + struct anv_batch_bo *bbo) +{ + struct anv_batch *batch = &cmd_buffer->batch; + struct anv_batch_bo *current_bbo = + anv_cmd_buffer_current_batch_bo(cmd_buffer); + + /* We set the end of the batch a little short so we would be sure we + * have room for the chaining command. Since we're about to emit the + * chaining command, let's set it back where it should go. 
+ */ + batch->end += GFX8_MI_BATCH_BUFFER_START_length * 4; + assert(batch->end == current_bbo->bo->map + current_bbo->bo->size); + + emit_batch_buffer_start(cmd_buffer, bbo->bo, 0); + + anv_batch_bo_finish(current_bbo, batch); +} + +static void +anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from, + struct anv_cmd_buffer *cmd_buffer_to) +{ + assert(!anv_use_relocations(cmd_buffer_from->device->physical)); + + uint32_t *bb_start = cmd_buffer_from->batch_end; + + struct anv_batch_bo *last_bbo = + list_last_entry(&cmd_buffer_from->batch_bos, struct anv_batch_bo, link); + struct anv_batch_bo *first_bbo = + list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link); + + struct GFX8_MI_BATCH_BUFFER_START gen_bb_start = { + __anv_cmd_header(GFX8_MI_BATCH_BUFFER_START), + .SecondLevelBatchBuffer = Firstlevelbatch, + .AddressSpaceIndicator = ASI_PPGTT, + .BatchBufferStartAddress = (struct anv_address) { first_bbo->bo, 0 }, + }; + struct anv_batch local_batch = { + .start = last_bbo->bo->map, + .end = last_bbo->bo->map + last_bbo->bo->size, + .relocs = &last_bbo->relocs, + .alloc = &cmd_buffer_from->vk.pool->alloc, + }; + + __anv_cmd_pack(GFX8_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start); + + last_bbo->chained = true; +} + +static void +anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer) +{ + assert(!anv_use_relocations(cmd_buffer->device->physical)); + + struct anv_batch_bo *last_bbo = + list_last_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link); + last_bbo->chained = false; + + uint32_t *batch = cmd_buffer->batch_end; + anv_pack_struct(batch, GFX8_MI_BATCH_BUFFER_END, + __anv_cmd_header(GFX8_MI_BATCH_BUFFER_END)); +} + +static VkResult +anv_cmd_buffer_chain_batch(struct anv_batch *batch, void *_data) +{ + struct anv_cmd_buffer *cmd_buffer = _data; + struct anv_batch_bo *new_bbo = NULL; + /* Cap reallocation to chunk. */ + uint32_t alloc_size = MIN2(cmd_buffer->total_batch_size, + ANV_MAX_CMD_BUFFER_BATCH_SIZE); + + VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo); + if (result != VK_SUCCESS) + return result; + + cmd_buffer->total_batch_size += alloc_size; + + struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos); + if (seen_bbo == NULL) { + anv_batch_bo_destroy(new_bbo, cmd_buffer); + return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + } + *seen_bbo = new_bbo; + + cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo); + + list_addtail(&new_bbo->link, &cmd_buffer->batch_bos); + + anv_batch_bo_start(new_bbo, batch, GFX8_MI_BATCH_BUFFER_START_length * 4); + + return VK_SUCCESS; +} + +static VkResult +anv_cmd_buffer_grow_batch(struct anv_batch *batch, void *_data) +{ + struct anv_cmd_buffer *cmd_buffer = _data; + struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer); + + anv_batch_bo_grow(cmd_buffer, bbo, &cmd_buffer->batch, 4096, + GFX8_MI_BATCH_BUFFER_START_length * 4); + + return VK_SUCCESS; +} + +/** Allocate a binding table + * + * This function allocates a binding table. This is a bit more complicated + * than one would think due to a combination of Vulkan driver design and some + * unfortunate hardware restrictions. + * + * The 3DSTATE_BINDING_TABLE_POINTERS_* packets only have a 16-bit field for + * the binding table pointer which means that all binding tables need to live + * in the bottom 64k of surface state base address. 
+ * The way the GL driver has classically dealt with this restriction is to
+ * emit all surface states on-the-fly into the batch and have a batch buffer
+ * smaller than 64k.  This isn't really an option in Vulkan for a couple of
+ * reasons:
+ *
+ * 1) In Vulkan, we have growing (or chaining) batches so surface states have
+ *    to live in their own buffer and we have to be able to re-emit
+ *    STATE_BASE_ADDRESS as needed which requires a full pipeline stall.  In
+ *    order to avoid emitting STATE_BASE_ADDRESS any more often than needed
+ *    (it's not that hard to hit 64k of just binding tables), we allocate
+ *    surface state objects up-front when VkImageView is created.  In order
+ *    for this to work, surface state objects need to be allocated from a
+ *    global buffer.
+ *
+ * 2) We tried to design the surface state system in such a way that it's
+ *    already ready for bindless texturing.  The way bindless texturing works
+ *    on our hardware is that you have a big pool of surface state objects
+ *    (with its own state base address) and the bindless handles are simply
+ *    offsets into that pool.  With the architecture we chose, we already
+ *    have that pool and it's exactly the same pool that we use for regular
+ *    surface states, so we should already be ready for bindless.
+ *
+ * 3) For render targets, we need to be able to fill out the surface states
+ *    later in vkBeginRenderPass so that we can assign clear colors
+ *    correctly.  One way to do this would be to just create the surface
+ *    state data and then repeatedly copy it into the surface state BO every
+ *    time we have to re-emit STATE_BASE_ADDRESS.  While this works, it's
+ *    rather annoying, and it is much simpler to just allocate the surface
+ *    states up-front and re-use them for the entire render pass.
+ *
+ * While none of these are technically blockers for emitting state on the fly
+ * like we do in GL, the ability to have a single surface state pool
+ * simplifies things greatly.  Unfortunately, it comes at a cost...
+ *
+ * Because of the 64k limitation of 3DSTATE_BINDING_TABLE_POINTERS_*, we can't
+ * place the binding tables just anywhere in surface state base address.
+ * Because 64k isn't a whole lot of space, we can't simply restrict the
+ * surface state buffer to 64k, so we have to be more clever.  The solution
+ * we've chosen is to have a block pool with a maximum size of 2G that starts
+ * at zero and grows in both directions.  All surface states are allocated
+ * from the top of the pool (positive offsets) and we allocate blocks (< 64k)
+ * of binding tables from the bottom of the pool (negative offsets).  Every
+ * time we allocate a new binding table block, we set surface state base
+ * address to point to the bottom of the binding table block.  This way all
+ * of the binding tables in the block are in the bottom 64k of surface state
+ * base address.  When we fill out the binding table, we add the distance
+ * between the bottom of our binding table block and zero of the block pool
+ * to the surface state offsets so that they are correct relative to our new
+ * surface state base address at the bottom of the binding table block.
+ *
+ * \see adjust_relocations_from_state_pool()
+ * \see adjust_relocations_to_state_pool()
+ *
+ * \param[in]  entries        The number of surface state entries the binding
+ *                            table should be able to hold.
+ *
+ * \param[out] state_offset   The offset from surface state base address
+ *                            where the surface states live.  This must be
+ *                            added to the surface state offset when it is
+ *                            written into the binding table entry.
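+ *
+ * As a rough illustration (the numbers here are made up): if the current
+ * binding table block was allocated at bt_block->offset == -40960, then
+ * *state_offset is set to 40960 below.  A surface state living at offset
+ * 4096 above the block pool's zero point is then written into the table as
+ * 4096 + 40960 == 45056, which is its offset from the new surface state
+ * base address at the bottom of the binding table block.  A hypothetical
+ * caller filling entry 0 (error handling omitted, surface_state standing
+ * for a state previously allocated with
+ * anv_cmd_buffer_alloc_surface_state()) might look roughly like this:
+ *
+ *    uint32_t state_offset;
+ *    struct anv_state bt =
+ *       anv_cmd_buffer_alloc_binding_table(cmd_buffer, entries, &state_offset);
+ *    uint32_t *bt_map = bt.map;
+ *    bt_map[0] = surface_state.offset + state_offset;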
+ * + * \return An anv_state representing the binding table + */ +struct anv_state +anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer, + uint32_t entries, uint32_t *state_offset) +{ + struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states); + + uint32_t bt_size = align_u32(entries * 4, 32); + + struct anv_state state = cmd_buffer->bt_next; + if (bt_size > state.alloc_size) + return (struct anv_state) { 0 }; + + state.alloc_size = bt_size; + cmd_buffer->bt_next.offset += bt_size; + cmd_buffer->bt_next.map += bt_size; + cmd_buffer->bt_next.alloc_size -= bt_size; + + if (cmd_buffer->device->info->verx10 >= 125) { + /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to change the binding + * table address independently from surface state base address. We no + * longer need any sort of offsetting. + */ + *state_offset = 0; + } else { + assert(bt_block->offset < 0); + *state_offset = -bt_block->offset; + } + + return state; +} + +struct anv_state +anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer) +{ + struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + return anv_state_stream_alloc(&cmd_buffer->surface_state_stream, + isl_dev->ss.size, isl_dev->ss.align); +} + +struct anv_state +anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer, + uint32_t size, uint32_t alignment) +{ + return anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream, + size, alignment); +} + +VkResult +anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states); + if (bt_block == NULL) { + anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + *bt_block = anv_binding_table_pool_alloc(cmd_buffer->device); + + /* The bt_next state is a rolling state (we update it as we suballocate + * from it) which is relative to the start of the binding table block. 
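+    * For example, right after the reset below bt_next.offset is 0 and
+    * bt_next.map points at the start of the new block; a subsequent
+    * anv_cmd_buffer_alloc_binding_table(cmd_buffer, 8, &state_offset) call
+    * would then consume align_u32(8 * 4, 32) == 32 bytes, leaving
+    * bt_next.offset == 32 and bt_next.alloc_size reduced by 32.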
+ */ + cmd_buffer->bt_next = *bt_block; + cmd_buffer->bt_next.offset = 0; + + return VK_SUCCESS; +} + +VkResult +anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_batch_bo *batch_bo = NULL; + VkResult result; + + list_inithead(&cmd_buffer->batch_bos); + + cmd_buffer->total_batch_size = ANV_MIN_CMD_BUFFER_BATCH_SIZE; + + result = anv_batch_bo_create(cmd_buffer, + cmd_buffer->total_batch_size, + &batch_bo); + if (result != VK_SUCCESS) + return result; + + list_addtail(&batch_bo->link, &cmd_buffer->batch_bos); + + cmd_buffer->batch.alloc = &cmd_buffer->vk.pool->alloc; + cmd_buffer->batch.user_data = cmd_buffer; + + if (cmd_buffer->device->can_chain_batches) { + cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch; + } else { + cmd_buffer->batch.extend_cb = anv_cmd_buffer_grow_batch; + } + + anv_batch_bo_start(batch_bo, &cmd_buffer->batch, + GFX8_MI_BATCH_BUFFER_START_length * 4); + + int success = u_vector_init_pow2(&cmd_buffer->seen_bbos, 8, + sizeof(struct anv_bo *)); + if (!success) + goto fail_batch_bo; + + *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo; + + success = u_vector_init(&cmd_buffer->bt_block_states, 8, + sizeof(struct anv_state)); + if (!success) + goto fail_seen_bbos; + + result = anv_reloc_list_init(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc); + if (result != VK_SUCCESS) + goto fail_bt_blocks; + cmd_buffer->last_ss_pool_center = 0; + + result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); + if (result != VK_SUCCESS) + goto fail_bt_blocks; + + return VK_SUCCESS; + + fail_bt_blocks: + u_vector_finish(&cmd_buffer->bt_block_states); + fail_seen_bbos: + u_vector_finish(&cmd_buffer->seen_bbos); + fail_batch_bo: + anv_batch_bo_destroy(batch_bo, cmd_buffer); + + return result; +} + +void +anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_state *bt_block; + u_vector_foreach(bt_block, &cmd_buffer->bt_block_states) + anv_binding_table_pool_free(cmd_buffer->device, *bt_block); + u_vector_finish(&cmd_buffer->bt_block_states); + + anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->vk.pool->alloc); + + u_vector_finish(&cmd_buffer->seen_bbos); + + /* Destroy all of the batch buffers */ + list_for_each_entry_safe(struct anv_batch_bo, bbo, + &cmd_buffer->batch_bos, link) { + list_del(&bbo->link); + anv_batch_bo_destroy(bbo, cmd_buffer); + } +} + +void +anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) +{ + /* Delete all but the first batch bo */ + assert(!list_is_empty(&cmd_buffer->batch_bos)); + while (cmd_buffer->batch_bos.next != cmd_buffer->batch_bos.prev) { + struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer); + list_del(&bbo->link); + anv_batch_bo_destroy(bbo, cmd_buffer); + } + assert(!list_is_empty(&cmd_buffer->batch_bos)); + + anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer), + &cmd_buffer->batch, + GFX8_MI_BATCH_BUFFER_START_length * 4); + + while (u_vector_length(&cmd_buffer->bt_block_states) > 1) { + struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states); + anv_binding_table_pool_free(cmd_buffer->device, *bt_block); + } + assert(u_vector_length(&cmd_buffer->bt_block_states) == 1); + cmd_buffer->bt_next = *(struct anv_state *)u_vector_head(&cmd_buffer->bt_block_states); + cmd_buffer->bt_next.offset = 0; + + anv_reloc_list_clear(&cmd_buffer->surface_relocs); + cmd_buffer->last_ss_pool_center = 0; + + /* Reset the list of seen buffers */ + cmd_buffer->seen_bbos.head = 0; + 
cmd_buffer->seen_bbos.tail = 0; + + struct anv_batch_bo *first_bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer); + + *(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = first_bbo; + + + assert(!cmd_buffer->device->can_chain_batches || + first_bbo->bo->size == ANV_MIN_CMD_BUFFER_BATCH_SIZE); + cmd_buffer->total_batch_size = first_bbo->bo->size; +} + +void +anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer); + + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { + /* When we start a batch buffer, we subtract a certain amount of + * padding from the end to ensure that we always have room to emit a + * BATCH_BUFFER_START to chain to the next BO. We need to remove + * that padding before we end the batch; otherwise, we may end up + * with our BATCH_BUFFER_END in another BO. + */ + cmd_buffer->batch.end += GFX8_MI_BATCH_BUFFER_START_length * 4; + assert(cmd_buffer->batch.start == batch_bo->bo->map); + assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size); + + /* Save end instruction location to override it later. */ + cmd_buffer->batch_end = cmd_buffer->batch.next; + + /* If we can chain this command buffer to another one, leave some place + * for the jump instruction. + */ + batch_bo->chained = anv_cmd_buffer_is_chainable(cmd_buffer); + if (batch_bo->chained) + emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0); + else + anv_batch_emit(&cmd_buffer->batch, GFX8_MI_BATCH_BUFFER_END, bbe); + + /* Round batch up to an even number of dwords. */ + if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4) + anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop); + + cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY; + } else { + assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + /* If this is a secondary command buffer, we need to determine the + * mode in which it will be executed with vkExecuteCommands. We + * determine this statically here so that this stays in sync with the + * actual ExecuteCommands implementation. + */ + const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start; + if (!cmd_buffer->device->can_chain_batches) { + cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT; + } else if (cmd_buffer->device->physical->use_call_secondary) { + cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN; + /* If the secondary command buffer begins & ends in the same BO and + * its length is less than the length of CS prefetch, add some NOOPs + * instructions so the last MI_BATCH_BUFFER_START is outside the CS + * prefetch. + */ + if (cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) { + const struct intel_device_info *devinfo = cmd_buffer->device->info; + /* Careful to have everything in signed integer. 
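+             * For instance (illustrative numbers only), with a 512 byte CS
+             * prefetch and a 128 byte secondary batch, the loop below emits
+             * (512 - 128) / 4 == 96 MI_NOOPs.  If batch_len were already
+             * larger than prefetch_len, the signed difference goes negative
+             * and no NOOPs are emitted; with unsigned arithmetic it would
+             * wrap around instead, which is why everything here stays signed.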
+             */
+            int32_t prefetch_len = devinfo->cs_prefetch_size;
+            int32_t batch_len =
+               cmd_buffer->batch.next - cmd_buffer->batch.start;
+
+            for (int32_t i = 0; i < (prefetch_len - batch_len); i += 4)
+               anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop);
+         }
+
+         void *jump_addr =
+            anv_batch_emitn(&cmd_buffer->batch,
+                            GFX8_MI_BATCH_BUFFER_START_length,
+                            GFX8_MI_BATCH_BUFFER_START,
+                            .AddressSpaceIndicator = ASI_PPGTT,
+                            .SecondLevelBatchBuffer = Firstlevelbatch) +
+            (GFX8_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8);
+         cmd_buffer->return_addr = anv_batch_address(&cmd_buffer->batch, jump_addr);
+
+         /* The emit above may have caused us to chain batch buffers which
+          * would mean that batch_bo is no longer valid.
+          */
+         batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
+      } else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) &&
+                 (length < ANV_MIN_CMD_BUFFER_BATCH_SIZE / 2)) {
+         /* If the secondary has exactly one batch buffer in its list *and*
+          * that batch buffer is less than half of the maximum size, we're
+          * probably better off simply copying it into our batch.
+          */
+         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_EMIT;
+      } else if (!(cmd_buffer->usage_flags &
+                   VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
+         cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN;
+
+         /* In order to chain, we need this command buffer to contain an
+          * MI_BATCH_BUFFER_START which will jump back to the calling batch.
+          * It doesn't matter where it points now so long as it has a valid
+          * relocation.  We'll adjust it later as part of the chaining
+          * process.
+          *
+          * We set the end of the batch a little short so we would be sure we
+          * have room for the chaining command.  Since we're about to emit the
+          * chaining command, let's set it back where it should go.
+ */ + cmd_buffer->batch.end += GFX8_MI_BATCH_BUFFER_START_length * 4; + assert(cmd_buffer->batch.start == batch_bo->bo->map); + assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size); + + emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0); + assert(cmd_buffer->batch.start == batch_bo->bo->map); + } else { + cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN; + } + } + + anv_batch_bo_finish(batch_bo, &cmd_buffer->batch); +} + +static VkResult +anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer, + struct list_head *list) +{ + list_for_each_entry(struct anv_batch_bo, bbo, list, link) { + struct anv_batch_bo **bbo_ptr = u_vector_add(&cmd_buffer->seen_bbos); + if (bbo_ptr == NULL) + return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY); + + *bbo_ptr = bbo; + } + + return VK_SUCCESS; +} + +void +anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, + struct anv_cmd_buffer *secondary) +{ + anv_measure_add_secondary(primary, secondary); + switch (secondary->exec_mode) { + case ANV_CMD_BUFFER_EXEC_MODE_EMIT: + anv_batch_emit_batch(&primary->batch, &secondary->batch); + break; + case ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT: { + struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(primary); + unsigned length = secondary->batch.end - secondary->batch.start; + anv_batch_bo_grow(primary, bbo, &primary->batch, length, + GFX8_MI_BATCH_BUFFER_START_length * 4); + anv_batch_emit_batch(&primary->batch, &secondary->batch); + break; + } + case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: { + struct anv_batch_bo *first_bbo = + list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link); + struct anv_batch_bo *last_bbo = + list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link); + + emit_batch_buffer_start(primary, first_bbo->bo, 0); + + struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary); + assert(primary->batch.start == this_bbo->bo->map); + uint32_t offset = primary->batch.next - primary->batch.start; + + /* Make the tail of the secondary point back to right after the + * MI_BATCH_BUFFER_START in the primary batch. 
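+       * After the anv_batch_bo_link() call below, execution flows roughly
+       * like this (sketch):
+       *
+       *    primary:   ... MI_BATCH_BUFFER_START --> first_bbo of the secondary
+       *    secondary: ... MI_BATCH_BUFFER_START --> this_bbo of the primary,
+       *                                             at the given offset
+       *
+       * where offset is the byte offset just past the MI_BATCH_BUFFER_START
+       * emitted above, so the secondary returns to the primary right where
+       * it left off.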
+ */ + anv_batch_bo_link(primary, last_bbo, this_bbo, offset); + + anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos); + break; + } + case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: { + struct list_head copy_list; + VkResult result = anv_batch_bo_list_clone(&secondary->batch_bos, + secondary, + ©_list); + if (result != VK_SUCCESS) + return; /* FIXME */ + + anv_cmd_buffer_add_seen_bbos(primary, ©_list); + + struct anv_batch_bo *first_bbo = + list_first_entry(©_list, struct anv_batch_bo, link); + struct anv_batch_bo *last_bbo = + list_last_entry(©_list, struct anv_batch_bo, link); + + cmd_buffer_chain_to_batch_bo(primary, first_bbo); + + list_splicetail(©_list, &primary->batch_bos); + + anv_batch_bo_continue(last_bbo, &primary->batch, + GFX8_MI_BATCH_BUFFER_START_length * 4); + break; + } + case ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN: { + struct anv_batch_bo *first_bbo = + list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link); + + uint64_t *write_return_addr = + anv_batch_emitn(&primary->batch, + GFX8_MI_STORE_DATA_IMM_length + 1 /* QWord write */, + GFX8_MI_STORE_DATA_IMM, + .Address = secondary->return_addr) + + (GFX8_MI_STORE_DATA_IMM_ImmediateData_start / 8); + + emit_batch_buffer_start(primary, first_bbo->bo, 0); + + *write_return_addr = + anv_address_physical(anv_batch_address(&primary->batch, + primary->batch.next)); + + anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos); + break; + } + default: + assert(!"Invalid execution mode"); + } + + anv_reloc_list_append(&primary->surface_relocs, &primary->vk.pool->alloc, + &secondary->surface_relocs, 0); +} + +struct anv_execbuf { + struct drm_i915_gem_execbuffer2 execbuf; + + struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences; + + struct drm_i915_gem_exec_object2 * objects; + uint32_t bo_count; + struct anv_bo ** bos; + + /* Allocated length of the 'objects' and 'bos' arrays */ + uint32_t array_length; + + uint32_t syncobj_count; + uint32_t syncobj_array_length; + struct drm_i915_gem_exec_fence * syncobjs; + uint64_t * syncobj_values; + + /* List of relocations for surface states, only used with platforms not + * using softpin. + */ + void * surface_states_relocs; + + uint32_t cmd_buffer_count; + struct anv_query_pool *perf_query_pool; + + /* Indicates whether any of the command buffers have relocations. This + * doesn't not necessarily mean we'll need the kernel to process them. It + * might be that a previous execbuf has already placed things in the VMA + * and we can make i915 skip the relocations. 
+ */ + bool has_relocs; + + const VkAllocationCallbacks * alloc; + VkSystemAllocationScope alloc_scope; + + int perf_query_pass; +}; + +static void +anv_execbuf_finish(struct anv_execbuf *exec) +{ + vk_free(exec->alloc, exec->syncobjs); + vk_free(exec->alloc, exec->syncobj_values); + vk_free(exec->alloc, exec->surface_states_relocs); + vk_free(exec->alloc, exec->objects); + vk_free(exec->alloc, exec->bos); +} + +static void +anv_execbuf_add_ext(struct anv_execbuf *exec, + uint32_t ext_name, + struct i915_user_extension *ext) +{ + __u64 *iter = &exec->execbuf.cliprects_ptr; + + exec->execbuf.flags |= I915_EXEC_USE_EXTENSIONS; + + while (*iter != 0) { + iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension; + } + + ext->name = ext_name; + + *iter = (uintptr_t) ext; +} + +static VkResult +anv_execbuf_add_bo_bitset(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t dep_words, + BITSET_WORD *deps, + uint32_t extra_flags); + +static VkResult +anv_execbuf_add_bo(struct anv_device *device, + struct anv_execbuf *exec, + struct anv_bo *bo, + struct anv_reloc_list *relocs, + uint32_t extra_flags) +{ + struct drm_i915_gem_exec_object2 *obj = NULL; + + bo = anv_bo_unwrap(bo); + + if (bo->exec_obj_index < exec->bo_count && + exec->bos[bo->exec_obj_index] == bo) + obj = &exec->objects[bo->exec_obj_index]; + + if (obj == NULL) { + /* We've never seen this one before. Add it to the list and assign + * an id that we can use later. + */ + if (exec->bo_count >= exec->array_length) { + uint32_t new_len = exec->objects ? exec->array_length * 2 : 64; + + struct drm_i915_gem_exec_object2 *new_objects = + vk_alloc(exec->alloc, new_len * sizeof(*new_objects), 8, exec->alloc_scope); + if (new_objects == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct anv_bo **new_bos = + vk_alloc(exec->alloc, new_len * sizeof(*new_bos), 8, exec->alloc_scope); + if (new_bos == NULL) { + vk_free(exec->alloc, new_objects); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + if (exec->objects) { + memcpy(new_objects, exec->objects, + exec->bo_count * sizeof(*new_objects)); + memcpy(new_bos, exec->bos, + exec->bo_count * sizeof(*new_bos)); + } + + vk_free(exec->alloc, exec->objects); + vk_free(exec->alloc, exec->bos); + + exec->objects = new_objects; + exec->bos = new_bos; + exec->array_length = new_len; + } + + assert(exec->bo_count < exec->array_length); + + bo->exec_obj_index = exec->bo_count++; + obj = &exec->objects[bo->exec_obj_index]; + exec->bos[bo->exec_obj_index] = bo; + + obj->handle = bo->gem_handle; + obj->relocation_count = 0; + obj->relocs_ptr = 0; + obj->alignment = 0; + obj->offset = bo->offset; + obj->flags = bo->flags | extra_flags; + obj->rsvd1 = 0; + obj->rsvd2 = 0; + } + + if (extra_flags & EXEC_OBJECT_WRITE) { + obj->flags |= EXEC_OBJECT_WRITE; + obj->flags &= ~EXEC_OBJECT_ASYNC; + } + + if (relocs != NULL) { + assert(obj->relocation_count == 0); + + if (relocs->num_relocs > 0) { + /* This is the first time we've ever seen a list of relocations for + * this BO. Go ahead and set the relocations and then walk the list + * of relocations and add them all. 
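+          * Note that anv_execbuf_add_bo() recurses on each reloc_bos[i]
+          * below, so every BO referenced by this relocation list also ends
+          * up in the validation list; the exec_obj_index check at the top of
+          * the function keeps any BO from being added twice.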
+ */ + exec->has_relocs = true; + obj->relocation_count = relocs->num_relocs; + obj->relocs_ptr = (uintptr_t) relocs->relocs; + + for (size_t i = 0; i < relocs->num_relocs; i++) { + VkResult result; + + /* A quick sanity check on relocations */ + assert(relocs->relocs[i].offset < bo->size); + result = anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i], + NULL, extra_flags); + if (result != VK_SUCCESS) + return result; + } + } + + return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words, + relocs->deps, extra_flags); + } + + return VK_SUCCESS; +} + +/* Add BO dependencies to execbuf */ +static VkResult +anv_execbuf_add_bo_bitset(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t dep_words, + BITSET_WORD *deps, + uint32_t extra_flags) +{ + for (uint32_t w = 0; w < dep_words; w++) { + BITSET_WORD mask = deps[w]; + while (mask) { + int i = u_bit_scan(&mask); + uint32_t gem_handle = w * BITSET_WORDBITS + i; + struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle); + assert(bo->refcount > 0); + VkResult result = + anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags); + if (result != VK_SUCCESS) + return result; + } + } + + return VK_SUCCESS; +} + +static void +anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer, + struct anv_reloc_list *list) +{ + for (size_t i = 0; i < list->num_relocs; i++) { + list->relocs[i].target_handle = + anv_bo_unwrap(list->reloc_bos[i])->exec_obj_index; + } +} + +static void +adjust_relocations_from_state_pool(struct anv_state_pool *pool, + struct anv_reloc_list *relocs, + uint32_t last_pool_center_bo_offset) +{ + assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset); + uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset; + + for (size_t i = 0; i < relocs->num_relocs; i++) { + /* All of the relocations from this block pool to other BO's should + * have been emitted relative to the surface block pool center. We + * need to add the center offset to make them relative to the + * beginning of the actual GEM bo. + */ + relocs->relocs[i].offset += delta; + } +} + +static void +adjust_relocations_to_state_pool(struct anv_state_pool *pool, + struct anv_bo *from_bo, + struct anv_reloc_list *relocs, + uint32_t last_pool_center_bo_offset) +{ + assert(!from_bo->is_wrapper); + assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset); + uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset; + + /* When we initially emit relocations into a block pool, we don't + * actually know what the final center_bo_offset will be so we just emit + * it as if center_bo_offset == 0. Now that we know what the center + * offset is, we need to walk the list of relocations and adjust any + * relocations that point to the pool bo with the correct offset. + */ + for (size_t i = 0; i < relocs->num_relocs; i++) { + if (relocs->reloc_bos[i] == pool->block_pool.bo) { + /* Adjust the delta value in the relocation to correctly + * correspond to the new delta. Initially, this value may have + * been negative (if treated as unsigned), but we trust in + * uint32_t roll-over to fix that for us at this point. + */ + relocs->relocs[i].delta += delta; + + /* Since the delta has changed, we need to update the actual + * relocated value with the new presumed value. This function + * should only be called on batch buffers, so we know it isn't in + * use by the GPU at the moment. 
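+          * As a made-up example: if the pool's center moved from an assumed
+          * 0 to 65536, a relocation that was emitted with delta == -4096 (an
+          * address below the pool's center, such as the bottom of a binding
+          * table block) now carries delta == -4096 + 65536 == 61440, and the
+          * write_reloc() below rewrites the batch with presumed_offset +
+          * 61440 so the batch contents match the new delta.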
+ */ + assert(relocs->relocs[i].offset < from_bo->size); + write_reloc(pool->block_pool.device, + from_bo->map + relocs->relocs[i].offset, + relocs->relocs[i].presumed_offset + + relocs->relocs[i].delta, false); + } + } +} + +static void +anv_reloc_list_apply(struct anv_device *device, + struct anv_reloc_list *list, + struct anv_bo *bo, + bool always_relocate) +{ + bo = anv_bo_unwrap(bo); + + for (size_t i = 0; i < list->num_relocs; i++) { + struct anv_bo *target_bo = anv_bo_unwrap(list->reloc_bos[i]); + if (list->relocs[i].presumed_offset == target_bo->offset && + !always_relocate) + continue; + + void *p = bo->map + list->relocs[i].offset; + write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true); + list->relocs[i].presumed_offset = target_bo->offset; + } +} + +/** + * This function applies the relocation for a command buffer and writes the + * actual addresses into the buffers as per what we were told by the kernel on + * the previous execbuf2 call. This should be safe to do because, for each + * relocated address, we have two cases: + * + * 1) The target BO is inactive (as seen by the kernel). In this case, it is + * not in use by the GPU so updating the address is 100% ok. It won't be + * in-use by the GPU (from our context) again until the next execbuf2 + * happens. If the kernel decides to move it in the next execbuf2, it + * will have to do the relocations itself, but that's ok because it should + * have all of the information needed to do so. + * + * 2) The target BO is active (as seen by the kernel). In this case, it + * hasn't moved since the last execbuffer2 call because GTT shuffling + * *only* happens when the BO is idle. (From our perspective, it only + * happens inside the execbuffer2 ioctl, but the shuffling may be + * triggered by another ioctl, with full-ppgtt this is limited to only + * execbuffer2 ioctls on the same context, or memory pressure.) Since the + * target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT + * address and the relocated value we are writing into the BO will be the + * same as the value that is already there. + * + * There is also a possibility that the target BO is active but the exact + * RENDER_SURFACE_STATE object we are writing the relocation into isn't in + * use. In this case, the address currently in the RENDER_SURFACE_STATE + * may be stale but it's still safe to write the relocation because that + * particular RENDER_SURFACE_STATE object isn't in-use by the GPU and + * won't be until the next execbuf2 call. + * + * By doing relocations on the CPU, we can tell the kernel that it doesn't + * need to bother. We want to do this because the surface state buffer is + * used by every command buffer so, if the kernel does the relocations, it + * will always be busy and the kernel will always stall. This is also + * probably the fastest mechanism for doing relocations since the kernel would + * have to make a full copy of all the relocations lists. + */ +static bool +execbuf_can_skip_relocations(struct anv_execbuf *exec) +{ + if (!exec->has_relocs) + return true; + + static int userspace_relocs = -1; + if (userspace_relocs < 0) + userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true); + if (!userspace_relocs) + return false; + + /* First, we have to check to see whether or not we can even do the + * relocation. New buffers which have never been submitted to the kernel + * don't have a valid offset so we need to let the kernel do relocations so + * that we can get offsets for them. 
On future execbuf2 calls, those + * buffers will have offsets and we will be able to skip relocating. + * Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1. + */ + for (uint32_t i = 0; i < exec->bo_count; i++) { + assert(!exec->bos[i]->is_wrapper); + if (exec->bos[i]->offset == (uint64_t)-1) + return false; + } + + return true; +} + +static void +relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer, + struct anv_execbuf *exec) +{ + /* Since surface states are shared between command buffers and we don't + * know what order they will be submitted to the kernel, we don't know + * what address is actually written in the surface state object at any + * given time. The only option is to always relocate them. + */ + struct anv_bo *surface_state_bo = + anv_bo_unwrap(cmd_buffer->device->surface_state_pool.block_pool.bo); + anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs, + surface_state_bo, + true /* always relocate surface states */); + + /* Since we own all of the batch buffers, we know what values are stored + * in the relocated addresses and only have to update them if the offsets + * have changed. + */ + struct anv_batch_bo **bbo; + u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { + anv_reloc_list_apply(cmd_buffer->device, + &(*bbo)->relocs, (*bbo)->bo, false); + } + + for (uint32_t i = 0; i < exec->bo_count; i++) + exec->objects[i].offset = exec->bos[i]->offset; +} + +static void +reset_cmd_buffer_surface_offsets(struct anv_cmd_buffer *cmd_buffer) +{ + /* In the case where we fall back to doing kernel relocations, we need to + * ensure that the relocation list is valid. All relocations on the batch + * buffers are already valid and kept up-to-date. Since surface states are + * shared between command buffers and we don't know what order they will be + * submitted to the kernel, we don't know what address is actually written + * in the surface state object at any given time. The only option is to set + * a bogus presumed offset and let the kernel relocate them. 
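+    * A presumed offset of -1 can never match a BO's actual offset, so the
+    * kernel will always process these relocations; this is the same trick
+    * anv_batch_bo_link() uses above to force a relocation.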
+ */ + for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++) + cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1; +} + +static VkResult +anv_execbuf_add_syncobj(struct anv_device *device, + struct anv_execbuf *exec, + uint32_t syncobj, + uint32_t flags, + uint64_t timeline_value) +{ + if (exec->syncobj_count >= exec->syncobj_array_length) { + uint32_t new_len = MAX2(exec->syncobj_array_length * 2, 16); + + struct drm_i915_gem_exec_fence *new_syncobjs = + vk_alloc(exec->alloc, new_len * sizeof(*new_syncobjs), + 8, exec->alloc_scope); + if (!new_syncobjs) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + if (exec->syncobjs) + typed_memcpy(new_syncobjs, exec->syncobjs, exec->syncobj_count); + + exec->syncobjs = new_syncobjs; + + if (exec->syncobj_values) { + uint64_t *new_syncobj_values = + vk_alloc(exec->alloc, new_len * sizeof(*new_syncobj_values), + 8, exec->alloc_scope); + if (!new_syncobj_values) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + typed_memcpy(new_syncobj_values, exec->syncobj_values, + exec->syncobj_count); + + exec->syncobj_values = new_syncobj_values; + } + + exec->syncobj_array_length = new_len; + } + + if (timeline_value && !exec->syncobj_values) { + exec->syncobj_values = + vk_zalloc(exec->alloc, exec->syncobj_array_length * + sizeof(*exec->syncobj_values), + 8, exec->alloc_scope); + if (!exec->syncobj_values) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + exec->syncobjs[exec->syncobj_count] = (struct drm_i915_gem_exec_fence) { + .handle = syncobj, + .flags = flags, + }; + if (timeline_value) + exec->syncobj_values[exec->syncobj_count] = timeline_value; + + exec->syncobj_count++; + + return VK_SUCCESS; +} + +static VkResult +anv_execbuf_add_sync(struct anv_device *device, + struct anv_execbuf *execbuf, + struct vk_sync *sync, + bool is_signal, + uint64_t value) +{ + /* It's illegal to signal a timeline with value 0 because that's never + * higher than the current value. A timeline wait on value 0 is always + * trivial because 0 <= uint64_t always. + */ + if ((sync->flags & VK_SYNC_IS_TIMELINE) && value == 0) + return VK_SUCCESS; + + if (vk_sync_is_anv_bo_sync(sync)) { + struct anv_bo_sync *bo_sync = + container_of(sync, struct anv_bo_sync, sync); + + assert(is_signal == (bo_sync->state == ANV_BO_SYNC_STATE_RESET)); + + return anv_execbuf_add_bo(device, execbuf, bo_sync->bo, NULL, + is_signal ? EXEC_OBJECT_WRITE : 0); + } else if (vk_sync_type_is_drm_syncobj(sync->type)) { + struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(sync); + + if (!(sync->flags & VK_SYNC_IS_TIMELINE)) + value = 0; + + return anv_execbuf_add_syncobj(device, execbuf, syncobj->syncobj, + is_signal ? I915_EXEC_FENCE_SIGNAL : + I915_EXEC_FENCE_WAIT, + value); + } + + unreachable("Invalid sync type"); +} + +static VkResult +setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf, + struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_state_pool *ss_pool = + &cmd_buffer->device->surface_state_pool; + + adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs, + cmd_buffer->last_ss_pool_center); + VkResult result; + if (anv_use_relocations(cmd_buffer->device->physical)) { + /* Since we aren't in the softpin case, all of our STATE_BASE_ADDRESS BOs + * will get added automatically by processing relocations on the batch + * buffer. We have to add the surface state BO manually because it has + * relocations of its own that we need to be sure are processed. 
+ */ + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + ss_pool->block_pool.bo, + &cmd_buffer->surface_relocs, 0); + if (result != VK_SUCCESS) + return result; + } else { + /* Add surface dependencies (BOs) to the execbuf */ + anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf, + cmd_buffer->surface_relocs.dep_words, + cmd_buffer->surface_relocs.deps, 0); + } + + /* First, we walk over all of the bos we've seen and add them and their + * relocations to the validate list. + */ + struct anv_batch_bo **bbo; + u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { + adjust_relocations_to_state_pool(ss_pool, (*bbo)->bo, &(*bbo)->relocs, + cmd_buffer->last_ss_pool_center); + + result = anv_execbuf_add_bo(cmd_buffer->device, execbuf, + (*bbo)->bo, &(*bbo)->relocs, 0); + if (result != VK_SUCCESS) + return result; + } + + /* Now that we've adjusted all of the surface state relocations, we need to + * record the surface state pool center so future executions of the command + * buffer can adjust correctly. + */ + cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset; + + return VK_SUCCESS; +} + +static void +chain_command_buffers(struct anv_cmd_buffer **cmd_buffers, + uint32_t num_cmd_buffers) +{ + if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) { + assert(num_cmd_buffers == 1); + return; + } + + /* Chain the N-1 first batch buffers */ + for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++) + anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]); + + /* Put an end to the last one */ + anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]); +} + +static VkResult +setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf, + struct anv_queue *queue, + struct anv_cmd_buffer **cmd_buffers, + uint32_t num_cmd_buffers) +{ + struct anv_device *device = queue->device; + struct anv_state_pool *ss_pool = &device->surface_state_pool; + VkResult result; + + /* Edit the tail of the command buffers to chain them all together if they + * can be. + */ + chain_command_buffers(cmd_buffers, num_cmd_buffers); + + for (uint32_t i = 0; i < num_cmd_buffers; i++) { + anv_measure_submit(cmd_buffers[i]); + result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]); + if (result != VK_SUCCESS) + return result; + } + + /* Add all the global BOs to the object list for softpin case. */ + if (!anv_use_relocations(device->physical)) { + anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) { + result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + struct anv_block_pool *pool; + pool = &device->dynamic_state_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + pool = &device->general_state_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + pool = &device->instruction_state_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + pool = &device->binding_table_pool.block_pool; + anv_block_pool_foreach_bo(bo, pool) { + result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + + /* Add the BOs for all user allocated memory objects because we can't + * track after binding updates of VK_EXT_descriptor_indexing. 
+ */ + list_for_each_entry(struct anv_device_memory, mem, + &device->memory_objects, link) { + result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0); + if (result != VK_SUCCESS) + return result; + } + } else { + /* We do not support chaining primary command buffers without + * softpin. + */ + assert(num_cmd_buffers == 1); + } + + bool no_reloc = true; + if (execbuf->has_relocs) { + no_reloc = execbuf_can_skip_relocations(execbuf); + if (no_reloc) { + /* If we were able to successfully relocate everything, tell the + * kernel that it can skip doing relocations. The requirement for + * using NO_RELOC is: + * + * 1) The addresses written in the objects must match the + * corresponding reloc.presumed_offset which in turn must match + * the corresponding execobject.offset. + * + * 2) To avoid stalling, execobject.offset should match the current + * address of that object within the active context. + * + * In order to satisfy all of the invariants that make userspace + * relocations to be safe (see relocate_cmd_buffer()), we need to + * further ensure that the addresses we use match those used by the + * kernel for the most recent execbuf2. + * + * The kernel may still choose to do relocations anyway if something + * has moved in the GTT. In this case, the relocation list still + * needs to be valid. All relocations on the batch buffers are + * already valid and kept up-to-date. For surface state relocations, + * by applying the relocations in relocate_cmd_buffer, we ensured + * that the address in the RENDER_SURFACE_STATE matches + * presumed_offset, so it should be safe for the kernel to relocate + * them as needed. + */ + for (uint32_t i = 0; i < num_cmd_buffers; i++) { + relocate_cmd_buffer(cmd_buffers[i], execbuf); + + anv_reloc_list_apply(device, &cmd_buffers[i]->surface_relocs, + device->surface_state_pool.block_pool.bo, + true /* always relocate surface states */); + } + } else { + /* In the case where we fall back to doing kernel relocations, we + * need to ensure that the relocation list is valid. All relocations + * on the batch buffers are already valid and kept up-to-date. Since + * surface states are shared between command buffers and we don't + * know what order they will be submitted to the kernel, we don't + * know what address is actually written in the surface state object + * at any given time. The only option is to set a bogus presumed + * offset and let the kernel relocate them. + */ + for (uint32_t i = 0; i < num_cmd_buffers; i++) + reset_cmd_buffer_surface_offsets(cmd_buffers[i]); + } + } + + struct anv_batch_bo *first_batch_bo = + list_first_entry(&cmd_buffers[0]->batch_bos, struct anv_batch_bo, link); + + /* The kernel requires that the last entry in the validation list be the + * batch buffer to execute. We can simply swap the element + * corresponding to the first batch_bo in the chain with the last + * element in the list. 
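[Editor's note] The comment above describes the i915 requirement that the batch buffer be the final entry of the execbuffer object list; the code that follows satisfies it with a swap that also patches the cached exec_obj_index on the BOs it moves. The same move, reduced to a standalone sketch over plain arrays (hypothetical types, not the driver's own):

#include <stdint.h>

struct fake_bo {
   uint32_t exec_obj_index;   /* where this BO currently sits in objects[] */
};

struct fake_exec_object {
   struct fake_bo *bo;        /* stands in for a drm_i915_gem_exec_object2 entry */
};

/* Move the object belonging to `batch` to the last slot of objects[],
 * keeping every BO's cached index consistent with its new position.
 */
static void
move_batch_last(struct fake_exec_object *objects, uint32_t count,
                struct fake_bo *batch)
{
   uint32_t idx = batch->exec_obj_index;
   uint32_t last = count - 1;

   if (idx == last)
      return;

   struct fake_exec_object tmp = objects[idx];

   objects[idx] = objects[last];
   objects[idx].bo->exec_obj_index = idx;

   objects[last] = tmp;
   objects[last].bo->exec_obj_index = last;
}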
+ */ + if (first_batch_bo->bo->exec_obj_index != execbuf->bo_count - 1) { + uint32_t idx = first_batch_bo->bo->exec_obj_index; + uint32_t last_idx = execbuf->bo_count - 1; + + struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx]; + assert(execbuf->bos[idx] == first_batch_bo->bo); + + execbuf->objects[idx] = execbuf->objects[last_idx]; + execbuf->bos[idx] = execbuf->bos[last_idx]; + execbuf->bos[idx]->exec_obj_index = idx; + + execbuf->objects[last_idx] = tmp_obj; + execbuf->bos[last_idx] = first_batch_bo->bo; + first_batch_bo->bo->exec_obj_index = last_idx; + } + + /* If we are pinning our BOs, we shouldn't have to relocate anything */ + if (!anv_use_relocations(device->physical)) + assert(!execbuf->has_relocs); + + /* Now we go through and fixup all of the relocation lists to point to the + * correct indices in the object array (I915_EXEC_HANDLE_LUT). We have to + * do this after we reorder the list above as some of the indices may have + * changed. + */ + struct anv_batch_bo **bbo; + if (execbuf->has_relocs) { + assert(num_cmd_buffers == 1); + u_vector_foreach(bbo, &cmd_buffers[0]->seen_bbos) + anv_cmd_buffer_process_relocs(cmd_buffers[0], &(*bbo)->relocs); + + anv_cmd_buffer_process_relocs(cmd_buffers[0], &cmd_buffers[0]->surface_relocs); + } + + if (device->physical->memory.need_clflush) { + __builtin_ia32_mfence(); + for (uint32_t i = 0; i < num_cmd_buffers; i++) { + u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) { + for (uint32_t l = 0; l < (*bbo)->length; l += CACHELINE_SIZE) + __builtin_ia32_clflush((*bbo)->bo->map + l); + } + } + } + + struct anv_batch *batch = &cmd_buffers[0]->batch; + execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf->objects, + .buffer_count = execbuf->bo_count, + .batch_start_offset = 0, + /* On platforms that cannot chain batch buffers because of the i915 + * command parser, we have to provide the batch length. Everywhere else + * we'll chain batches so no point in passing a length. + */ + .batch_len = device->can_chain_batches ? 0 : batch->next - batch->start, + .cliprects_ptr = 0, + .num_cliprects = 0, + .DR1 = 0, + .DR4 = 0, + .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | (no_reloc ? 
I915_EXEC_NO_RELOC : 0), + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + + return VK_SUCCESS; +} + +static VkResult +setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue) +{ + struct anv_device *device = queue->device; + VkResult result = anv_execbuf_add_bo(device, execbuf, + device->trivial_batch_bo, + NULL, 0); + if (result != VK_SUCCESS) + return result; + + execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf->objects, + .buffer_count = execbuf->bo_count, + .batch_start_offset = 0, + .batch_len = 8, /* GFX7_MI_BATCH_BUFFER_END and NOOP */ + .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC, + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + + return VK_SUCCESS; +} + +static VkResult +setup_utrace_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue, + struct anv_utrace_flush_copy *flush) +{ + struct anv_device *device = queue->device; + VkResult result = anv_execbuf_add_bo(device, execbuf, + flush->batch_bo, + &flush->relocs, 0); + if (result != VK_SUCCESS) + return result; + + result = anv_execbuf_add_sync(device, execbuf, flush->sync, + true /* is_signal */, 0 /* value */); + if (result != VK_SUCCESS) + return result; + + if (flush->batch_bo->exec_obj_index != execbuf->bo_count - 1) { + uint32_t idx = flush->batch_bo->exec_obj_index; + uint32_t last_idx = execbuf->bo_count - 1; + + struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx]; + assert(execbuf->bos[idx] == flush->batch_bo); + + execbuf->objects[idx] = execbuf->objects[last_idx]; + execbuf->bos[idx] = execbuf->bos[last_idx]; + execbuf->bos[idx]->exec_obj_index = idx; + + execbuf->objects[last_idx] = tmp_obj; + execbuf->bos[last_idx] = flush->batch_bo; + flush->batch_bo->exec_obj_index = last_idx; + } + + if (device->physical->memory.need_clflush) + intel_flush_range(flush->batch_bo->map, flush->batch_bo->size); + + execbuf->execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf->objects, + .buffer_count = execbuf->bo_count, + .batch_start_offset = 0, + .batch_len = flush->batch.next - flush->batch.start, + .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_FENCE_ARRAY | queue->exec_flags | + (execbuf->has_relocs ? 0 : I915_EXEC_NO_RELOC), + .rsvd1 = device->context_id, + .rsvd2 = 0, + .num_cliprects = execbuf->syncobj_count, + .cliprects_ptr = (uintptr_t)execbuf->syncobjs, + }; + + return VK_SUCCESS; +} + +static VkResult +anv_queue_exec_utrace_locked(struct anv_queue *queue, + struct anv_utrace_flush_copy *flush) +{ + assert(flush->batch_bo); + + struct anv_device *device = queue->device; + struct anv_execbuf execbuf = { + .alloc = &device->vk.alloc, + .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, + }; + + VkResult result = setup_utrace_execbuf(&execbuf, queue, flush); + if (result != VK_SUCCESS) + goto error; + + int ret = queue->device->info->no_hw ? 
0 : + anv_gem_execbuffer(queue->device, &execbuf.execbuf); + if (ret) + result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m"); + + struct drm_i915_gem_exec_object2 *objects = execbuf.objects; + for (uint32_t k = 0; k < execbuf.bo_count; k++) { + if (anv_bo_is_pinned(execbuf.bos[k])) + assert(execbuf.bos[k]->offset == objects[k].offset); + execbuf.bos[k]->offset = objects[k].offset; + } + + error: + anv_execbuf_finish(&execbuf); + + return result; +} + +/* We lock around execbuf for three main reasons: + * + * 1) When a block pool is resized, we create a new gem handle with a + * different size and, in the case of surface states, possibly a different + * center offset but we re-use the same anv_bo struct when we do so. If + * this happens in the middle of setting up an execbuf, we could end up + * with our list of BOs out of sync with our list of gem handles. + * + * 2) The algorithm we use for building the list of unique buffers isn't + * thread-safe. While the client is supposed to synchronize around + * QueueSubmit, this would be extremely difficult to debug if it ever came + * up in the wild due to a broken app. It's better to play it safe and + * just lock around QueueSubmit. + * + * 3) The anv_cmd_buffer_execbuf function may perform relocations in + * userspace. Due to the fact that the surface state buffer is shared + * between batches, we can't afford to have that happen from multiple + * threads at the same time. Even though the user is supposed to ensure + * this doesn't happen, we play it safe as in (2) above. + * + * Since the only other things that ever take the device lock such as block + * pool resize only rarely happen, this will almost never be contended so + * taking a lock isn't really an expensive operation in this case. + */ +static VkResult +anv_queue_exec_locked(struct anv_queue *queue, + uint32_t wait_count, + const struct vk_sync_wait *waits, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + uint32_t signal_count, + const struct vk_sync_signal *signals, + struct anv_query_pool *perf_query_pool, + uint32_t perf_query_pass) +{ + struct anv_device *device = queue->device; + struct anv_utrace_flush_copy *utrace_flush_data = NULL; + struct anv_execbuf execbuf = { + .alloc = &queue->device->vk.alloc, + .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, + .perf_query_pass = perf_query_pass, + }; + + /* Flush the trace points first, they need to be moved */ + VkResult result = + anv_device_utrace_flush_cmd_buffers(queue, + cmd_buffer_count, + cmd_buffers, + &utrace_flush_data); + if (result != VK_SUCCESS) + goto error; + + if (utrace_flush_data && !utrace_flush_data->batch_bo) { + result = anv_execbuf_add_sync(device, &execbuf, + utrace_flush_data->sync, + true /* is_signal */, + 0); + if (result != VK_SUCCESS) + goto error; + + utrace_flush_data = NULL; + } + + /* Always add the workaround BO as it includes a driver identifier for the + * error_state. 
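[Editor's note] Both execbuf paths in this hunk end by copying the kernel-assigned offsets out of the exec object array back into the driver's BO structs; keeping those presumed offsets accurate is what allows later submissions to pass I915_EXEC_NO_RELOC. A hedged sketch of that write-back, assuming the kernel's i915 uAPI header and a hypothetical toy_bo stand-in for anv_bo:

#include <stdint.h>
#include <drm/i915_drm.h>

struct toy_bo {
   uint64_t offset;   /* hypothetical stand-in for the driver's BO offset */
};

/* After DRM_IOCTL_I915_GEM_EXECBUFFER2 returns, objects[i].offset holds the
 * GPU address the kernel actually used for each buffer; record it so the
 * next submission's presumed offsets match reality.
 */
static void
update_presumed_offsets(struct toy_bo **bos,
                        const struct drm_i915_gem_exec_object2 *objects,
                        uint32_t count)
{
   for (uint32_t i = 0; i < count; i++)
      bos[i]->offset = objects[i].offset;
}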
+ */ + result = + anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0); + if (result != VK_SUCCESS) + goto error; + + for (uint32_t i = 0; i < wait_count; i++) { + result = anv_execbuf_add_sync(device, &execbuf, + waits[i].sync, + false /* is_signal */, + waits[i].wait_value); + if (result != VK_SUCCESS) + goto error; + } + + for (uint32_t i = 0; i < signal_count; i++) { + result = anv_execbuf_add_sync(device, &execbuf, + signals[i].sync, + true /* is_signal */, + signals[i].signal_value); + if (result != VK_SUCCESS) + goto error; + } + + if (queue->sync) { + result = anv_execbuf_add_sync(device, &execbuf, + queue->sync, + true /* is_signal */, + 0 /* signal_value */); + if (result != VK_SUCCESS) + goto error; + } + + if (cmd_buffer_count) { + result = setup_execbuf_for_cmd_buffers(&execbuf, queue, + cmd_buffers, + cmd_buffer_count); + } else { + result = setup_empty_execbuf(&execbuf, queue); + } + + if (result != VK_SUCCESS) + goto error; + + const bool has_perf_query = + perf_query_pool && perf_query_pass >= 0 && cmd_buffer_count; + + if (INTEL_DEBUG(DEBUG_SUBMIT)) { + fprintf(stderr, "Batch offset=0x%x len=0x%x on queue 0\n", + execbuf.execbuf.batch_start_offset, execbuf.execbuf.batch_len); + for (uint32_t i = 0; i < execbuf.bo_count; i++) { + const struct anv_bo *bo = execbuf.bos[i]; + + fprintf(stderr, " BO: addr=0x%016"PRIx64"-0x%016"PRIx64" size=0x%010"PRIx64 + " handle=%05u name=%s\n", + bo->offset, bo->offset + bo->size - 1, bo->size, bo->gem_handle, bo->name); + } + } + + if (INTEL_DEBUG(DEBUG_BATCH)) { + fprintf(stderr, "Batch on queue %d\n", (int)(queue - device->queues)); + if (cmd_buffer_count) { + if (has_perf_query) { + struct anv_bo *pass_batch_bo = perf_query_pool->bo; + uint64_t pass_batch_offset = + khr_perf_query_preamble_offset(perf_query_pool, perf_query_pass); + + intel_print_batch(&device->decoder_ctx, + pass_batch_bo->map + pass_batch_offset, 64, + pass_batch_bo->offset + pass_batch_offset, false); + } + + for (uint32_t i = 0; i < cmd_buffer_count; i++) { + struct anv_batch_bo **bo = + u_vector_tail(&cmd_buffers[i]->seen_bbos); + device->cmd_buffer_being_decoded = cmd_buffers[i]; + intel_print_batch(&device->decoder_ctx, (*bo)->bo->map, + (*bo)->bo->size, (*bo)->bo->offset, false); + device->cmd_buffer_being_decoded = NULL; + } + } else { + intel_print_batch(&device->decoder_ctx, + device->trivial_batch_bo->map, + device->trivial_batch_bo->size, + device->trivial_batch_bo->offset, false); + } + } + + if (execbuf.syncobj_values) { + execbuf.timeline_fences.fence_count = execbuf.syncobj_count; + execbuf.timeline_fences.handles_ptr = (uintptr_t)execbuf.syncobjs; + execbuf.timeline_fences.values_ptr = (uintptr_t)execbuf.syncobj_values; + anv_execbuf_add_ext(&execbuf, + DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES, + &execbuf.timeline_fences.base); + } else if (execbuf.syncobjs) { + execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY; + execbuf.execbuf.num_cliprects = execbuf.syncobj_count; + execbuf.execbuf.cliprects_ptr = (uintptr_t)execbuf.syncobjs; + } + + if (has_perf_query) { + assert(perf_query_pass < perf_query_pool->n_passes); + struct intel_perf_query_info *query_info = + perf_query_pool->pass_query[perf_query_pass]; + + /* Some performance queries just the pipeline statistic HW, no need for + * OA in that case, so no need to reconfigure. 
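[Editor's note] A few lines above, the syncobjs collected earlier are handed to the kernel either through the timeline-fences execbuffer extension (when any timeline values are present) or through the legacy I915_EXEC_FENCE_ARRAY path, which reuses the cliprects fields of drm_i915_gem_execbuffer2. A hedged sketch of that uAPI arrangement; attach_fences is a made-up helper and the exact extension chaining is assumed from the i915 uAPI header, error handling omitted:

#include <stdint.h>
#include <drm/i915_drm.h>

static void
attach_fences(struct drm_i915_gem_execbuffer2 *eb,
              struct drm_i915_gem_execbuffer_ext_timeline_fences *ext,
              struct drm_i915_gem_exec_fence *fences,
              uint64_t *values, uint32_t count)
{
   if (values) {
      /* Timeline path: chain an extension carrying per-fence values. */
      ext->base.name = DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES;
      ext->fence_count = count;
      ext->handles_ptr = (uintptr_t)fences;
      ext->values_ptr = (uintptr_t)values;
      eb->flags |= I915_EXEC_USE_EXTENSIONS;
      eb->cliprects_ptr = (uintptr_t)ext;
   } else {
      /* Legacy path: the fence array rides in the (unused) cliprects fields. */
      eb->flags |= I915_EXEC_FENCE_ARRAY;
      eb->num_cliprects = count;
      eb->cliprects_ptr = (uintptr_t)fences;
   }
}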
+ */ + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG) && + (query_info->kind == INTEL_PERF_QUERY_TYPE_OA || + query_info->kind == INTEL_PERF_QUERY_TYPE_RAW)) { + int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG, + (void *)(uintptr_t) query_info->oa_metrics_set_id); + if (ret < 0) { + result = vk_device_set_lost(&device->vk, + "i915-perf config failed: %s", + strerror(errno)); + } + } + + struct anv_bo *pass_batch_bo = perf_query_pool->bo; + + struct drm_i915_gem_exec_object2 query_pass_object = { + .handle = pass_batch_bo->gem_handle, + .offset = pass_batch_bo->offset, + .flags = pass_batch_bo->flags, + }; + struct drm_i915_gem_execbuffer2 query_pass_execbuf = { + .buffers_ptr = (uintptr_t) &query_pass_object, + .buffer_count = 1, + .batch_start_offset = khr_perf_query_preamble_offset(perf_query_pool, + perf_query_pass), + .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags, + .rsvd1 = device->context_id, + }; + + int ret = queue->device->info->no_hw ? 0 : + anv_gem_execbuffer(queue->device, &query_pass_execbuf); + if (ret) + result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m"); + } + + int ret = queue->device->info->no_hw ? 0 : + anv_gem_execbuffer(queue->device, &execbuf.execbuf); + if (ret) + result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m"); + + if (result == VK_SUCCESS && queue->sync) { + result = vk_sync_wait(&device->vk, queue->sync, 0, + VK_SYNC_WAIT_COMPLETE, UINT64_MAX); + if (result != VK_SUCCESS) + result = vk_queue_set_lost(&queue->vk, "sync wait failed"); + } + + struct drm_i915_gem_exec_object2 *objects = execbuf.objects; + for (uint32_t k = 0; k < execbuf.bo_count; k++) { + if (anv_bo_is_pinned(execbuf.bos[k])) + assert(execbuf.bos[k]->offset == objects[k].offset); + execbuf.bos[k]->offset = objects[k].offset; + } + + error: + anv_execbuf_finish(&execbuf); + + if (result == VK_SUCCESS && utrace_flush_data) + result = anv_queue_exec_utrace_locked(queue, utrace_flush_data); + + return result; +} + +static inline bool +can_chain_query_pools(struct anv_query_pool *p1, struct anv_query_pool *p2) +{ + return (!p1 || !p2 || p1 == p2); +} + +static VkResult +anv_queue_submit_locked(struct anv_queue *queue, + struct vk_queue_submit *submit) +{ + VkResult result; + + if (submit->command_buffer_count == 0) { + result = anv_queue_exec_locked(queue, submit->wait_count, submit->waits, + 0 /* cmd_buffer_count */, + NULL /* cmd_buffers */, + submit->signal_count, submit->signals, + NULL /* perf_query_pool */, + 0 /* perf_query_pass */); + if (result != VK_SUCCESS) + return result; + } else { + /* Everything's easier if we don't have to bother with container_of() */ + STATIC_ASSERT(offsetof(struct anv_cmd_buffer, vk) == 0); + struct vk_command_buffer **vk_cmd_buffers = submit->command_buffers; + struct anv_cmd_buffer **cmd_buffers = (void *)vk_cmd_buffers; + uint32_t start = 0; + uint32_t end = submit->command_buffer_count; + struct anv_query_pool *perf_query_pool = + cmd_buffers[start]->perf_query_pool; + for (uint32_t n = 0; n < end; n++) { + bool can_chain = false; + uint32_t next = n + 1; + /* Can we chain the last buffer into the next one? */ + if (next < end && + anv_cmd_buffer_is_chainable(cmd_buffers[next]) && + can_chain_query_pools + (cmd_buffers[next]->perf_query_pool, perf_query_pool)) { + can_chain = true; + perf_query_pool = + perf_query_pool ? perf_query_pool : + cmd_buffers[next]->perf_query_pool; + } + if (!can_chain) { + /* The next buffer cannot be chained, or we have reached the + * last buffer, submit what have been chained so far. 
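[Editor's note] The loop this comment belongs to walks the submitted command buffers and flushes a run of chainable ones whenever chaining has to stop: an incompatible perf query pool, a non-chainable buffer, or the end of the list. Waits are only attached to the first run and signals only to the last. The control flow, stripped of the driver specifics into a small self-contained sketch (chainable() is a stand-in predicate):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Submit items[start..end) as one execbuf; waits only apply to the first
 * run of a submission and signals only to the last one.
 */
static void
flush_run(uint32_t start, uint32_t end, bool first_run, bool last_run)
{
   printf("submit %u..%u (waits: %d, signals: %d)\n",
          start, end - 1, first_run, last_run);
}

/* Hypothetical predicate: may item i be chained after item i - 1? */
static bool
chainable(const int *items, uint32_t i)
{
   return items[i] >= 0;   /* stand-in rule, just for the example */
}

static void
submit_in_runs(const int *items, uint32_t count)
{
   uint32_t start = 0;
   for (uint32_t n = 0; n < count; n++) {
      uint32_t next = n + 1;
      bool can_chain = next < count && chainable(items, next);
      if (!can_chain) {
         flush_run(start, next, start == 0, next == count);
         start = next;
      }
   }
}

int
main(void)
{
   const int items[] = { 1, 2, -1, 3, 4 };   /* item 2 cannot be chained into */
   submit_in_runs(items, 5);                 /* prints runs 0..1 and 2..4 */
   return 0;
}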
+ */ + VkResult result = + anv_queue_exec_locked(queue, + start == 0 ? submit->wait_count : 0, + start == 0 ? submit->waits : NULL, + next - start, &cmd_buffers[start], + next == end ? submit->signal_count : 0, + next == end ? submit->signals : NULL, + perf_query_pool, + submit->perf_pass_index); + if (result != VK_SUCCESS) + return result; + if (next < end) { + start = next; + perf_query_pool = cmd_buffers[start]->perf_query_pool; + } + } + } + } + for (uint32_t i = 0; i < submit->signal_count; i++) { + if (!vk_sync_is_anv_bo_sync(submit->signals[i].sync)) + continue; + + struct anv_bo_sync *bo_sync = + container_of(submit->signals[i].sync, struct anv_bo_sync, sync); + + /* Once the execbuf has returned, we need to set the fence state to + * SUBMITTED. We can't do this before calling execbuf because + * anv_GetFenceStatus does take the global device lock before checking + * fence->state. + * + * We set the fence state to SUBMITTED regardless of whether or not the + * execbuf succeeds because we need to ensure that vkWaitForFences() and + * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or + * VK_SUCCESS) in a finite amount of time even if execbuf fails. + */ + assert(bo_sync->state == ANV_BO_SYNC_STATE_RESET); + bo_sync->state = ANV_BO_SYNC_STATE_SUBMITTED; + } + + pthread_cond_broadcast(&queue->device->queue_submit); + + return VK_SUCCESS; +} + +VkResult +anv_queue_submit(struct vk_queue *vk_queue, + struct vk_queue_submit *submit) +{ + struct anv_queue *queue = container_of(vk_queue, struct anv_queue, vk); + struct anv_device *device = queue->device; + VkResult result; + + if (queue->device->info->no_hw) { + for (uint32_t i = 0; i < submit->signal_count; i++) { + result = vk_sync_signal(&device->vk, + submit->signals[i].sync, + submit->signals[i].signal_value); + if (result != VK_SUCCESS) + return vk_queue_set_lost(&queue->vk, "vk_sync_signal failed"); + } + return VK_SUCCESS; + } + + uint64_t start_ts = intel_ds_begin_submit(queue->ds); + + pthread_mutex_lock(&device->mutex); + result = anv_queue_submit_locked(queue, submit); + /* Take submission ID under lock */ + pthread_mutex_unlock(&device->mutex); + + intel_ds_end_submit(queue->ds, start_ts); + + return result; +} + +VkResult +anv_queue_submit_simple_batch(struct anv_queue *queue, + struct anv_batch *batch) +{ + struct anv_device *device = queue->device; + VkResult result = VK_SUCCESS; + int err; + + if (queue->device->info->no_hw) + return VK_SUCCESS; + + /* This is only used by device init so we can assume the queue is empty and + * we aren't fighting with a submit thread. 
+ */ + assert(vk_queue_is_empty(&queue->vk)); + + uint32_t batch_size = align_u32(batch->next - batch->start, 8); + + struct anv_bo *batch_bo = NULL; + result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size, &batch_bo); + if (result != VK_SUCCESS) + return result; + + memcpy(batch_bo->map, batch->start, batch_size); + if (device->physical->memory.need_clflush) + intel_flush_range(batch_bo->map, batch_size); + + struct anv_execbuf execbuf = { + .alloc = &queue->device->vk.alloc, + .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, + }; + + result = anv_execbuf_add_bo(device, &execbuf, batch_bo, NULL, 0); + if (result != VK_SUCCESS) + goto fail; + + if (INTEL_DEBUG(DEBUG_BATCH)) { + intel_print_batch(&device->decoder_ctx, + batch_bo->map, + batch_bo->size, + batch_bo->offset, false); + } + + execbuf.execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf.objects, + .buffer_count = execbuf.bo_count, + .batch_start_offset = 0, + .batch_len = batch_size, + .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC, + .rsvd1 = device->context_id, + .rsvd2 = 0, + }; + + err = anv_gem_execbuffer(device, &execbuf.execbuf); + if (err) { + result = vk_device_set_lost(&device->vk, "anv_gem_execbuffer failed: %m"); + goto fail; + } + + result = anv_device_wait(device, batch_bo, INT64_MAX); + if (result != VK_SUCCESS) { + result = vk_device_set_lost(&device->vk, + "anv_device_wait failed: %m"); + goto fail; + } + +fail: + anv_execbuf_finish(&execbuf); + anv_bo_pool_free(&device->batch_bo_pool, batch_bo); + + return result; +} diff --git a/src/intel/vulkan_hasvk/anv_blorp.c b/src/intel/vulkan_hasvk/anv_blorp.c new file mode 100644 index 00000000000..c829cb8aa46 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_blorp.c @@ -0,0 +1,1995 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" + +static bool +lookup_blorp_shader(struct blorp_batch *batch, + const void *key, uint32_t key_size, + uint32_t *kernel_out, void *prog_data_out) +{ + struct blorp_context *blorp = batch->blorp; + struct anv_device *device = blorp->driver_ctx; + + struct anv_shader_bin *bin = + anv_device_search_for_kernel(device, device->internal_cache, + key, key_size, NULL); + if (!bin) + return false; + + /* The cache already has a reference and it's not going anywhere so there + * is no need to hold a second reference. 
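[Editor's note] Earlier in this hunk, anv_queue_submit_simple_batch copies the batch through a CPU map and, when need_clflush is set (no LLC), flushes it with intel_flush_range before handing it to the GPU; setup_execbuf_for_cmd_buffers does the same with an explicit clflush loop. A freestanding x86-only sketch of that idea, assuming a 64-byte cacheline (intel_flush_range itself is the driver's helper and is not reproduced here):

#include <stddef.h>
#include <stdint.h>

#define CACHELINE_SIZE 64

/* Flush every cacheline touched by [start, start + size) so CPU stores are
 * visible to the GPU before it consumes the buffer.
 */
static void
flush_range_for_gpu(void *start, size_t size)
{
   __builtin_ia32_mfence();
   char *p = (char *)((uintptr_t)start & ~(uintptr_t)(CACHELINE_SIZE - 1));
   char *end = (char *)start + size;
   for (; p < end; p += CACHELINE_SIZE)
      __builtin_ia32_clflush(p);
   __builtin_ia32_mfence();
}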
+ */ + anv_shader_bin_unref(device, bin); + + *kernel_out = bin->kernel.offset; + *(const struct brw_stage_prog_data **)prog_data_out = bin->prog_data; + + return true; +} + +static bool +upload_blorp_shader(struct blorp_batch *batch, uint32_t stage, + const void *key, uint32_t key_size, + const void *kernel, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, + uint32_t *kernel_out, void *prog_data_out) +{ + struct blorp_context *blorp = batch->blorp; + struct anv_device *device = blorp->driver_ctx; + + struct anv_pipeline_bind_map bind_map = { + .surface_count = 0, + .sampler_count = 0, + }; + + struct anv_shader_bin *bin = + anv_device_upload_kernel(device, device->internal_cache, stage, + key, key_size, kernel, kernel_size, + prog_data, prog_data_size, + NULL, 0, NULL, &bind_map); + + if (!bin) + return false; + + /* The cache already has a reference and it's not going anywhere so there + * is no need to hold a second reference. + */ + anv_shader_bin_unref(device, bin); + + *kernel_out = bin->kernel.offset; + *(const struct brw_stage_prog_data **)prog_data_out = bin->prog_data; + + return true; +} + +void +anv_device_init_blorp(struct anv_device *device) +{ + const struct blorp_config config = { + .use_mesh_shading = device->physical->vk.supported_extensions.NV_mesh_shader, + }; + + blorp_init(&device->blorp, device, &device->isl_dev, &config); + device->blorp.compiler = device->physical->compiler; + device->blorp.lookup_shader = lookup_blorp_shader; + device->blorp.upload_shader = upload_blorp_shader; + switch (device->info->verx10) { + case 70: + device->blorp.exec = gfx7_blorp_exec; + break; + case 75: + device->blorp.exec = gfx75_blorp_exec; + break; + case 80: + device->blorp.exec = gfx8_blorp_exec; + break; + case 90: + device->blorp.exec = gfx9_blorp_exec; + break; + case 110: + device->blorp.exec = gfx11_blorp_exec; + break; + case 120: + device->blorp.exec = gfx12_blorp_exec; + break; + case 125: + device->blorp.exec = gfx125_blorp_exec; + break; + default: + unreachable("Unknown hardware generation"); + } +} + +void +anv_device_finish_blorp(struct anv_device *device) +{ + blorp_finish(&device->blorp); +} + +static void +anv_blorp_batch_init(struct anv_cmd_buffer *cmd_buffer, + struct blorp_batch *batch, enum blorp_batch_flags flags) +{ + if (!(cmd_buffer->queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT)) { + assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_COMPUTE_BIT); + flags |= BLORP_BATCH_USE_COMPUTE; + } + + blorp_batch_init(&cmd_buffer->device->blorp, batch, cmd_buffer, flags); +} + +static void +anv_blorp_batch_finish(struct blorp_batch *batch) +{ + blorp_batch_finish(batch); +} + +static void +get_blorp_surf_for_anv_buffer(struct anv_device *device, + struct anv_buffer *buffer, uint64_t offset, + uint32_t width, uint32_t height, + uint32_t row_pitch, enum isl_format format, + bool is_dest, + struct blorp_surf *blorp_surf, + struct isl_surf *isl_surf) +{ + bool ok UNUSED; + + *blorp_surf = (struct blorp_surf) { + .surf = isl_surf, + .addr = { + .buffer = buffer->address.bo, + .offset = buffer->address.offset + offset, + .mocs = anv_mocs(device, buffer->address.bo, + is_dest ? ISL_SURF_USAGE_RENDER_TARGET_BIT + : ISL_SURF_USAGE_TEXTURE_BIT), + }, + }; + + ok = isl_surf_init(&device->isl_dev, isl_surf, + .dim = ISL_SURF_DIM_2D, + .format = format, + .width = width, + .height = height, + .depth = 1, + .levels = 1, + .array_len = 1, + .samples = 1, + .row_pitch_B = row_pitch, + .usage = is_dest ? 
ISL_SURF_USAGE_RENDER_TARGET_BIT + : ISL_SURF_USAGE_TEXTURE_BIT, + .tiling_flags = ISL_TILING_LINEAR_BIT); + assert(ok); +} + +/* Pick something high enough that it won't be used in core and low enough it + * will never map to an extension. + */ +#define ANV_IMAGE_LAYOUT_EXPLICIT_AUX (VkImageLayout)10000000 + +static struct blorp_address +anv_to_blorp_address(struct anv_address addr) +{ + return (struct blorp_address) { + .buffer = addr.bo, + .offset = addr.offset, + }; +} + +static void +get_blorp_surf_for_anv_image(const struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlags aspect, + VkImageUsageFlags usage, + VkImageLayout layout, + enum isl_aux_usage aux_usage, + struct blorp_surf *blorp_surf) +{ + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + if (layout != ANV_IMAGE_LAYOUT_EXPLICIT_AUX) { + assert(usage != 0); + aux_usage = anv_layout_to_aux_usage(device->info, image, + aspect, usage, layout); + } + + isl_surf_usage_flags_t mocs_usage = + (usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) ? + ISL_SURF_USAGE_RENDER_TARGET_BIT : ISL_SURF_USAGE_TEXTURE_BIT; + + const struct anv_surface *surface = &image->planes[plane].primary_surface; + const struct anv_address address = + anv_image_address(image, &surface->memory_range); + + *blorp_surf = (struct blorp_surf) { + .surf = &surface->isl, + .addr = { + .buffer = address.bo, + .offset = address.offset, + .mocs = anv_mocs(device, address.bo, mocs_usage), + }, + }; + + if (aux_usage != ISL_AUX_USAGE_NONE) { + const struct anv_surface *aux_surface = &image->planes[plane].aux_surface; + const struct anv_address aux_address = + anv_image_address(image, &aux_surface->memory_range); + + blorp_surf->aux_usage = aux_usage; + blorp_surf->aux_surf = &aux_surface->isl; + + if (!anv_address_is_null(aux_address)) { + blorp_surf->aux_addr = (struct blorp_address) { + .buffer = aux_address.bo, + .offset = aux_address.offset, + .mocs = anv_mocs(device, aux_address.bo, 0), + }; + } + + /* If we're doing a partial resolve, then we need the indirect clear + * color. 
If we are doing a fast clear and want to store/update the + * clear color, we also pass the address to blorp, otherwise it will only + * stomp the CCS to a particular value and won't care about format or + * clear value + */ + if (aspect & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + const struct anv_address clear_color_addr = + anv_image_get_clear_color_addr(device, image, aspect); + blorp_surf->clear_color_addr = anv_to_blorp_address(clear_color_addr); + } else if (aspect & VK_IMAGE_ASPECT_DEPTH_BIT) { + const struct anv_address clear_color_addr = + anv_image_get_clear_color_addr(device, image, aspect); + blorp_surf->clear_color_addr = anv_to_blorp_address(clear_color_addr); + blorp_surf->clear_color = (union isl_color_value) { + .f32 = { ANV_HZ_FC_VAL }, + }; + } + } +} + +static bool +get_blorp_surf_for_anv_shadow_image(const struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlags aspect, + struct blorp_surf *blorp_surf) +{ + + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + if (!anv_surface_is_valid(&image->planes[plane].shadow_surface)) + return false; + + const struct anv_surface *surface = &image->planes[plane].shadow_surface; + const struct anv_address address = + anv_image_address(image, &surface->memory_range); + + *blorp_surf = (struct blorp_surf) { + .surf = &surface->isl, + .addr = { + .buffer = address.bo, + .offset = address.offset, + .mocs = anv_mocs(device, address.bo, ISL_SURF_USAGE_RENDER_TARGET_BIT), + }, + }; + + return true; +} + +static void +copy_image(struct anv_cmd_buffer *cmd_buffer, + struct blorp_batch *batch, + struct anv_image *src_image, + VkImageLayout src_image_layout, + struct anv_image *dst_image, + VkImageLayout dst_image_layout, + const VkImageCopy2 *region) +{ + VkOffset3D srcOffset = + vk_image_sanitize_offset(&src_image->vk, region->srcOffset); + VkOffset3D dstOffset = + vk_image_sanitize_offset(&dst_image->vk, region->dstOffset); + VkExtent3D extent = + vk_image_sanitize_extent(&src_image->vk, region->extent); + + const uint32_t dst_level = region->dstSubresource.mipLevel; + unsigned dst_base_layer, layer_count; + if (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) { + dst_base_layer = region->dstOffset.z; + layer_count = region->extent.depth; + } else { + dst_base_layer = region->dstSubresource.baseArrayLayer; + layer_count = vk_image_subresource_layer_count(&dst_image->vk, + ®ion->dstSubresource); + } + + const uint32_t src_level = region->srcSubresource.mipLevel; + unsigned src_base_layer; + if (src_image->vk.image_type == VK_IMAGE_TYPE_3D) { + src_base_layer = region->srcOffset.z; + } else { + src_base_layer = region->srcSubresource.baseArrayLayer; + assert(layer_count == + vk_image_subresource_layer_count(&src_image->vk, + ®ion->srcSubresource)); + } + + VkImageAspectFlags src_mask = region->srcSubresource.aspectMask, + dst_mask = region->dstSubresource.aspectMask; + + assert(anv_image_aspects_compatible(src_mask, dst_mask)); + + if (util_bitcount(src_mask) > 1) { + anv_foreach_image_aspect_bit(aspect_bit, src_image, src_mask) { + struct blorp_surf src_surf, dst_surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, + src_image, 1UL << aspect_bit, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + src_image_layout, ISL_AUX_USAGE_NONE, + &src_surf); + get_blorp_surf_for_anv_image(cmd_buffer->device, + dst_image, 1UL << aspect_bit, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + dst_image_layout, ISL_AUX_USAGE_NONE, + &dst_surf); + anv_cmd_buffer_mark_image_written(cmd_buffer, dst_image, + 1UL << aspect_bit, + dst_surf.aux_usage, 
dst_level, + dst_base_layer, layer_count); + + for (unsigned i = 0; i < layer_count; i++) { + blorp_copy(batch, &src_surf, src_level, src_base_layer + i, + &dst_surf, dst_level, dst_base_layer + i, + srcOffset.x, srcOffset.y, + dstOffset.x, dstOffset.y, + extent.width, extent.height); + } + + struct blorp_surf dst_shadow_surf; + if (get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, + dst_image, + 1UL << aspect_bit, + &dst_shadow_surf)) { + for (unsigned i = 0; i < layer_count; i++) { + blorp_copy(batch, &src_surf, src_level, src_base_layer + i, + &dst_shadow_surf, dst_level, dst_base_layer + i, + srcOffset.x, srcOffset.y, + dstOffset.x, dstOffset.y, + extent.width, extent.height); + } + } + } + } else { + struct blorp_surf src_surf, dst_surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, src_image, src_mask, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + src_image_layout, ISL_AUX_USAGE_NONE, + &src_surf); + get_blorp_surf_for_anv_image(cmd_buffer->device, dst_image, dst_mask, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + dst_image_layout, ISL_AUX_USAGE_NONE, + &dst_surf); + anv_cmd_buffer_mark_image_written(cmd_buffer, dst_image, dst_mask, + dst_surf.aux_usage, dst_level, + dst_base_layer, layer_count); + + for (unsigned i = 0; i < layer_count; i++) { + blorp_copy(batch, &src_surf, src_level, src_base_layer + i, + &dst_surf, dst_level, dst_base_layer + i, + srcOffset.x, srcOffset.y, + dstOffset.x, dstOffset.y, + extent.width, extent.height); + } + + struct blorp_surf dst_shadow_surf; + if (get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, + dst_image, dst_mask, + &dst_shadow_surf)) { + for (unsigned i = 0; i < layer_count; i++) { + blorp_copy(batch, &src_surf, src_level, src_base_layer + i, + &dst_shadow_surf, dst_level, dst_base_layer + i, + srcOffset.x, srcOffset.y, + dstOffset.x, dstOffset.y, + extent.width, extent.height); + } + } + } +} + +void anv_CmdCopyImage2( + VkCommandBuffer commandBuffer, + const VkCopyImageInfo2* pCopyImageInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, src_image, pCopyImageInfo->srcImage); + ANV_FROM_HANDLE(anv_image, dst_image, pCopyImageInfo->dstImage); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { + copy_image(cmd_buffer, &batch, + src_image, pCopyImageInfo->srcImageLayout, + dst_image, pCopyImageInfo->dstImageLayout, + &pCopyImageInfo->pRegions[r]); + } + + anv_blorp_batch_finish(&batch); +} + +static enum isl_format +isl_format_for_size(unsigned size_B) +{ + /* Prefer 32-bit per component formats for CmdFillBuffer */ + switch (size_B) { + case 1: return ISL_FORMAT_R8_UINT; + case 2: return ISL_FORMAT_R16_UINT; + case 3: return ISL_FORMAT_R8G8B8_UINT; + case 4: return ISL_FORMAT_R32_UINT; + case 6: return ISL_FORMAT_R16G16B16_UINT; + case 8: return ISL_FORMAT_R32G32_UINT; + case 12: return ISL_FORMAT_R32G32B32_UINT; + case 16: return ISL_FORMAT_R32G32B32A32_UINT; + default: + unreachable("Unknown format size"); + } +} + +static void +copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer, + struct blorp_batch *batch, + struct anv_buffer *anv_buffer, + struct anv_image *anv_image, + VkImageLayout image_layout, + const VkBufferImageCopy2* region, + bool buffer_to_image) +{ + struct { + struct blorp_surf surf; + uint32_t level; + VkOffset3D offset; + } image, buffer, *src, *dst; + + buffer.level = 0; + buffer.offset = (VkOffset3D) { 0, 0, 0 }; + + if (buffer_to_image) { + src = &buffer; + dst = ℑ + } else { + src = 
ℑ + dst = &buffer; + } + + const VkImageAspectFlags aspect = region->imageSubresource.aspectMask; + + get_blorp_surf_for_anv_image(cmd_buffer->device, anv_image, aspect, + buffer_to_image ? + VK_IMAGE_USAGE_TRANSFER_DST_BIT : + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + image_layout, ISL_AUX_USAGE_NONE, + &image.surf); + image.offset = + vk_image_sanitize_offset(&anv_image->vk, region->imageOffset); + image.level = region->imageSubresource.mipLevel; + + VkExtent3D extent = + vk_image_sanitize_extent(&anv_image->vk, region->imageExtent); + if (anv_image->vk.image_type != VK_IMAGE_TYPE_3D) { + image.offset.z = region->imageSubresource.baseArrayLayer; + extent.depth = + vk_image_subresource_layer_count(&anv_image->vk, + ®ion->imageSubresource); + } + + const enum isl_format linear_format = + anv_get_isl_format(cmd_buffer->device->info, anv_image->vk.format, + aspect, VK_IMAGE_TILING_LINEAR); + const struct isl_format_layout *linear_fmtl = + isl_format_get_layout(linear_format); + + const struct vk_image_buffer_layout buffer_layout = + vk_image_buffer_copy_layout(&anv_image->vk, region); + + /* Some formats have additional restrictions which may cause ISL to + * fail to create a surface for us. For example, YCbCr formats + * have to have 2-pixel aligned strides. + * + * To avoid these issues, we always bind the buffer as if it's a + * "normal" format like RGBA32_UINT. Since we're using blorp_copy, + * the format doesn't matter as long as it has the right bpb. + */ + const VkExtent2D buffer_extent = { + .width = DIV_ROUND_UP(extent.width, linear_fmtl->bw), + .height = DIV_ROUND_UP(extent.height, linear_fmtl->bh), + }; + const enum isl_format buffer_format = + isl_format_for_size(linear_fmtl->bpb / 8); + + struct isl_surf buffer_isl_surf; + get_blorp_surf_for_anv_buffer(cmd_buffer->device, + anv_buffer, region->bufferOffset, + buffer_extent.width, buffer_extent.height, + buffer_layout.row_stride_B, buffer_format, + false, &buffer.surf, &buffer_isl_surf); + + bool dst_has_shadow = false; + struct blorp_surf dst_shadow_surf; + if (&image == dst) { + /* In this case, the source is the buffer and, since blorp takes its + * copy dimensions in terms of the source format, we have to use the + * scaled down version for compressed textures because the source + * format is an RGB format. 
+ */ + extent.width = buffer_extent.width; + extent.height = buffer_extent.height; + + anv_cmd_buffer_mark_image_written(cmd_buffer, anv_image, + aspect, dst->surf.aux_usage, + dst->level, + dst->offset.z, extent.depth); + + dst_has_shadow = + get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, + anv_image, aspect, + &dst_shadow_surf); + } + + for (unsigned z = 0; z < extent.depth; z++) { + blorp_copy(batch, &src->surf, src->level, src->offset.z, + &dst->surf, dst->level, dst->offset.z, + src->offset.x, src->offset.y, dst->offset.x, dst->offset.y, + extent.width, extent.height); + + if (dst_has_shadow) { + blorp_copy(batch, &src->surf, src->level, src->offset.z, + &dst_shadow_surf, dst->level, dst->offset.z, + src->offset.x, src->offset.y, + dst->offset.x, dst->offset.y, + extent.width, extent.height); + } + + image.offset.z++; + buffer.surf.addr.offset += buffer_layout.image_stride_B; + } +} + +void anv_CmdCopyBufferToImage2( + VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2* pCopyBufferToImageInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer); + ANV_FROM_HANDLE(anv_image, dst_image, pCopyBufferToImageInfo->dstImage); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { + copy_buffer_to_image(cmd_buffer, &batch, src_buffer, dst_image, + pCopyBufferToImageInfo->dstImageLayout, + &pCopyBufferToImageInfo->pRegions[r], true); + } + + anv_blorp_batch_finish(&batch); +} + +void anv_CmdCopyImageToBuffer2( + VkCommandBuffer commandBuffer, + const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, src_image, pCopyImageToBufferInfo->srcImage); + ANV_FROM_HANDLE(anv_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < pCopyImageToBufferInfo->regionCount; r++) { + copy_buffer_to_image(cmd_buffer, &batch, dst_buffer, src_image, + pCopyImageToBufferInfo->srcImageLayout, + &pCopyImageToBufferInfo->pRegions[r], false); + } + + anv_blorp_batch_finish(&batch); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES; +} + +static bool +flip_coords(unsigned *src0, unsigned *src1, unsigned *dst0, unsigned *dst1) +{ + bool flip = false; + if (*src0 > *src1) { + unsigned tmp = *src0; + *src0 = *src1; + *src1 = tmp; + flip = !flip; + } + + if (*dst0 > *dst1) { + unsigned tmp = *dst0; + *dst0 = *dst1; + *dst1 = tmp; + flip = !flip; + } + + return flip; +} + +static void +blit_image(struct anv_cmd_buffer *cmd_buffer, + struct blorp_batch *batch, + struct anv_image *src_image, + VkImageLayout src_image_layout, + struct anv_image *dst_image, + VkImageLayout dst_image_layout, + const VkImageBlit2 *region, + VkFilter filter) +{ + const VkImageSubresourceLayers *src_res = ®ion->srcSubresource; + const VkImageSubresourceLayers *dst_res = ®ion->dstSubresource; + + struct blorp_surf src, dst; + + enum blorp_filter blorp_filter; + switch (filter) { + case VK_FILTER_NEAREST: + blorp_filter = BLORP_FILTER_NEAREST; + break; + case VK_FILTER_LINEAR: + blorp_filter = BLORP_FILTER_BILINEAR; + break; + default: + unreachable("Invalid filter"); + } + + assert(anv_image_aspects_compatible(src_res->aspectMask, + dst_res->aspectMask)); + + anv_foreach_image_aspect_bit(aspect_bit, src_image, 
src_res->aspectMask) { + get_blorp_surf_for_anv_image(cmd_buffer->device, + src_image, 1U << aspect_bit, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + src_image_layout, ISL_AUX_USAGE_NONE, &src); + get_blorp_surf_for_anv_image(cmd_buffer->device, + dst_image, 1U << aspect_bit, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + dst_image_layout, ISL_AUX_USAGE_NONE, &dst); + + struct anv_format_plane src_format = + anv_get_format_aspect(cmd_buffer->device->info, src_image->vk.format, + 1U << aspect_bit, src_image->vk.tiling); + struct anv_format_plane dst_format = + anv_get_format_aspect(cmd_buffer->device->info, dst_image->vk.format, + 1U << aspect_bit, dst_image->vk.tiling); + + unsigned dst_start, dst_end; + if (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) { + assert(dst_res->baseArrayLayer == 0); + dst_start = region->dstOffsets[0].z; + dst_end = region->dstOffsets[1].z; + } else { + dst_start = dst_res->baseArrayLayer; + dst_end = dst_start + + vk_image_subresource_layer_count(&dst_image->vk, dst_res); + } + + unsigned src_start, src_end; + if (src_image->vk.image_type == VK_IMAGE_TYPE_3D) { + assert(src_res->baseArrayLayer == 0); + src_start = region->srcOffsets[0].z; + src_end = region->srcOffsets[1].z; + } else { + src_start = src_res->baseArrayLayer; + src_end = src_start + + vk_image_subresource_layer_count(&src_image->vk, src_res); + } + + bool flip_z = flip_coords(&src_start, &src_end, &dst_start, &dst_end); + const unsigned num_layers = dst_end - dst_start; + float src_z_step = (float)(src_end - src_start) / (float)num_layers; + + /* There is no interpolation to the pixel center during rendering, so + * add the 0.5 offset ourselves here. */ + float depth_center_offset = 0; + if (src_image->vk.image_type == VK_IMAGE_TYPE_3D) + depth_center_offset = 0.5 / num_layers * (src_end - src_start); + + if (flip_z) { + src_start = src_end; + src_z_step *= -1; + depth_center_offset *= -1; + } + + unsigned src_x0 = region->srcOffsets[0].x; + unsigned src_x1 = region->srcOffsets[1].x; + unsigned dst_x0 = region->dstOffsets[0].x; + unsigned dst_x1 = region->dstOffsets[1].x; + bool flip_x = flip_coords(&src_x0, &src_x1, &dst_x0, &dst_x1); + + unsigned src_y0 = region->srcOffsets[0].y; + unsigned src_y1 = region->srcOffsets[1].y; + unsigned dst_y0 = region->dstOffsets[0].y; + unsigned dst_y1 = region->dstOffsets[1].y; + bool flip_y = flip_coords(&src_y0, &src_y1, &dst_y0, &dst_y1); + + anv_cmd_buffer_mark_image_written(cmd_buffer, dst_image, + 1U << aspect_bit, + dst.aux_usage, + dst_res->mipLevel, + dst_start, num_layers); + + for (unsigned i = 0; i < num_layers; i++) { + unsigned dst_z = dst_start + i; + float src_z = src_start + i * src_z_step + depth_center_offset; + + blorp_blit(batch, &src, src_res->mipLevel, src_z, + src_format.isl_format, src_format.swizzle, + &dst, dst_res->mipLevel, dst_z, + dst_format.isl_format, dst_format.swizzle, + src_x0, src_y0, src_x1, src_y1, + dst_x0, dst_y0, dst_x1, dst_y1, + blorp_filter, flip_x, flip_y); + } + } +} + +void anv_CmdBlitImage2( + VkCommandBuffer commandBuffer, + const VkBlitImageInfo2* pBlitImageInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, src_image, pBlitImageInfo->srcImage); + ANV_FROM_HANDLE(anv_image, dst_image, pBlitImageInfo->dstImage); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < pBlitImageInfo->regionCount; r++) { + blit_image(cmd_buffer, &batch, + src_image, pBlitImageInfo->srcImageLayout, + dst_image, pBlitImageInfo->dstImageLayout, + 
&pBlitImageInfo->pRegions[r], pBlitImageInfo->filter); + } + + anv_blorp_batch_finish(&batch); +} + +/** + * Returns the greatest common divisor of a and b that is a power of two. + */ +static uint64_t +gcd_pow2_u64(uint64_t a, uint64_t b) +{ + assert(a > 0 || b > 0); + + unsigned a_log2 = ffsll(a) - 1; + unsigned b_log2 = ffsll(b) - 1; + + /* If either a or b is 0, then a_log2 or b_log2 till be UINT_MAX in which + * case, the MIN2() will take the other one. If both are 0 then we will + * hit the assert above. + */ + return 1 << MIN2(a_log2, b_log2); +} + +/* This is maximum possible width/height our HW can handle */ +#define MAX_SURFACE_DIM (1ull << 14) + +static void +copy_buffer(struct anv_device *device, + struct blorp_batch *batch, + struct anv_buffer *src_buffer, + struct anv_buffer *dst_buffer, + const VkBufferCopy2 *region) +{ + struct blorp_address src = { + .buffer = src_buffer->address.bo, + .offset = src_buffer->address.offset + region->srcOffset, + .mocs = anv_mocs(device, src_buffer->address.bo, + ISL_SURF_USAGE_TEXTURE_BIT), + }; + struct blorp_address dst = { + .buffer = dst_buffer->address.bo, + .offset = dst_buffer->address.offset + region->dstOffset, + .mocs = anv_mocs(device, dst_buffer->address.bo, + ISL_SURF_USAGE_RENDER_TARGET_BIT), + }; + + blorp_buffer_copy(batch, src, dst, region->size); +} + +void anv_CmdCopyBuffer2( + VkCommandBuffer commandBuffer, + const VkCopyBufferInfo2* pCopyBufferInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, src_buffer, pCopyBufferInfo->srcBuffer); + ANV_FROM_HANDLE(anv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < pCopyBufferInfo->regionCount; r++) { + copy_buffer(cmd_buffer->device, &batch, src_buffer, dst_buffer, + &pCopyBufferInfo->pRegions[r]); + } + + anv_blorp_batch_finish(&batch); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES; +} + + +void anv_CmdUpdateBuffer( + VkCommandBuffer commandBuffer, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize dataSize, + const void* pData) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + /* We can't quite grab a full block because the state stream needs a + * little data at the top to build its linked list. + */ + const uint32_t max_update_size = + cmd_buffer->device->dynamic_state_pool.block_size - 64; + + assert(max_update_size < MAX_SURFACE_DIM * 4); + + /* We're about to read data that was written from the CPU. Flush the + * texture cache so we don't get anything stale. 
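[Editor's note] gcd_pow2_u64, defined earlier in this file, picks the largest power of two dividing both arguments by comparing trailing-zero counts (ffsll returns the 1-based index of the lowest set bit, so 0 underflows to UINT_MAX and loses the MIN). A quick standalone check of the same math, using the compiler builtin rather than the driver's copy:

#include <assert.h>
#include <stdint.h>

static uint64_t
pow2_gcd(uint64_t a, uint64_t b)
{
   assert(a > 0 || b > 0);
   unsigned a_log2 = __builtin_ffsll(a) - 1;   /* trailing zeros; UINT_MAX if a == 0 */
   unsigned b_log2 = __builtin_ffsll(b) - 1;
   unsigned lo = a_log2 < b_log2 ? a_log2 : b_log2;
   return 1ull << lo;
}

int
main(void)
{
   assert(pow2_gcd(12, 40) == 4);    /* 12 = 0b1100, 40 = 0b101000 */
   assert(pow2_gcd(16, 64) == 16);
   assert(pow2_gcd(0, 8) == 8);      /* 0 falls back to the other argument */
   return 0;
}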
+ */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, + "before UpdateBuffer"); + + while (dataSize) { + const uint32_t copy_size = MIN2(dataSize, max_update_size); + + struct anv_state tmp_data = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, copy_size, 64); + + memcpy(tmp_data.map, pData, copy_size); + + struct blorp_address src = { + .buffer = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = tmp_data.offset, + .mocs = isl_mocs(&cmd_buffer->device->isl_dev, + ISL_SURF_USAGE_TEXTURE_BIT, false) + }; + struct blorp_address dst = { + .buffer = dst_buffer->address.bo, + .offset = dst_buffer->address.offset + dstOffset, + .mocs = anv_mocs(cmd_buffer->device, dst_buffer->address.bo, + ISL_SURF_USAGE_RENDER_TARGET_BIT), + }; + + blorp_buffer_copy(&batch, src, dst, copy_size); + + dataSize -= copy_size; + dstOffset += copy_size; + pData = (void *)pData + copy_size; + } + + anv_blorp_batch_finish(&batch); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES; +} + +void anv_CmdFillBuffer( + VkCommandBuffer commandBuffer, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize fillSize, + uint32_t data) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer); + struct blorp_surf surf; + struct isl_surf isl_surf; + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + fillSize = vk_buffer_range(&dst_buffer->vk, dstOffset, fillSize); + + /* From the Vulkan spec: + * + * "size is the number of bytes to fill, and must be either a multiple + * of 4, or VK_WHOLE_SIZE to fill the range from offset to the end of + * the buffer. If VK_WHOLE_SIZE is used and the remaining size of the + * buffer is not a multiple of 4, then the nearest smaller multiple is + * used." + */ + fillSize &= ~3ull; + + /* First, we compute the biggest format that can be used with the + * given offsets and size. 
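[Editor's note] The fill path that follows first derives the widest texel size that both dstOffset and fillSize are aligned to, then carves the fill into full MAX_SURFACE_DIM x MAX_SURFACE_DIM rectangles, a run of full-width rows, and a final partial row. A worked sketch of that decomposition under the same 1 << 14 surface limit (the concrete offset and size are made up for illustration):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_SURFACE_DIM (1ull << 14)

int
main(void)
{
   uint64_t dst_offset = 8;              /* bytes */
   uint64_t fill_size = 5ull << 30;      /* 5 GiB, already a multiple of 4 */

   /* Widest power-of-two texel size (<= 16) dividing both offset and size;
    * ends up at 8 here, i.e. an R32G32_UINT-sized texel.
    */
   uint64_t bs = 16;
   while ((dst_offset % bs) || (fill_size % bs))
      bs >>= 1;

   uint64_t texels = fill_size / bs;
   uint64_t per_rect = MAX_SURFACE_DIM * MAX_SURFACE_DIM;

   uint64_t full_rects = texels / per_rect;
   uint64_t rows = (texels % per_rect) / MAX_SURFACE_DIM;
   uint64_t tail = (texels % per_rect) % MAX_SURFACE_DIM;

   printf("%u-byte texels: %" PRIu64 " full 16384x16384 clears, "
          "one %" PRIu64 "-row clear, one %" PRIu64 "-texel row\n",
          (unsigned)bs, full_rects, rows, tail);
   return 0;
}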
+ */ + int bs = 16; + bs = gcd_pow2_u64(bs, dstOffset); + bs = gcd_pow2_u64(bs, fillSize); + enum isl_format isl_format = isl_format_for_size(bs); + + union isl_color_value color = { + .u32 = { data, data, data, data }, + }; + + const uint64_t max_fill_size = MAX_SURFACE_DIM * MAX_SURFACE_DIM * bs; + while (fillSize >= max_fill_size) { + get_blorp_surf_for_anv_buffer(cmd_buffer->device, + dst_buffer, dstOffset, + MAX_SURFACE_DIM, MAX_SURFACE_DIM, + MAX_SURFACE_DIM * bs, isl_format, true, + &surf, &isl_surf); + + blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY, + 0, 0, 1, 0, 0, MAX_SURFACE_DIM, MAX_SURFACE_DIM, + color, 0 /* color_write_disable */); + fillSize -= max_fill_size; + dstOffset += max_fill_size; + } + + uint64_t height = fillSize / (MAX_SURFACE_DIM * bs); + assert(height < MAX_SURFACE_DIM); + if (height != 0) { + const uint64_t rect_fill_size = height * MAX_SURFACE_DIM * bs; + get_blorp_surf_for_anv_buffer(cmd_buffer->device, + dst_buffer, dstOffset, + MAX_SURFACE_DIM, height, + MAX_SURFACE_DIM * bs, isl_format, true, + &surf, &isl_surf); + + blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY, + 0, 0, 1, 0, 0, MAX_SURFACE_DIM, height, + color, 0 /* color_write_disable */); + fillSize -= rect_fill_size; + dstOffset += rect_fill_size; + } + + if (fillSize != 0) { + const uint32_t width = fillSize / bs; + get_blorp_surf_for_anv_buffer(cmd_buffer->device, + dst_buffer, dstOffset, + width, 1, + width * bs, isl_format, true, + &surf, &isl_surf); + + blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY, + 0, 0, 1, 0, 0, width, 1, + color, 0 /* color_write_disable */); + } + + anv_blorp_batch_finish(&batch); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES; +} + +void anv_CmdClearColorImage( + VkCommandBuffer commandBuffer, + VkImage _image, + VkImageLayout imageLayout, + const VkClearColorValue* pColor, + uint32_t rangeCount, + const VkImageSubresourceRange* pRanges) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, image, _image); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + for (unsigned r = 0; r < rangeCount; r++) { + if (pRanges[r].aspectMask == 0) + continue; + + assert(pRanges[r].aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, pRanges[r].aspectMask, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + imageLayout, ISL_AUX_USAGE_NONE, &surf); + + struct anv_format_plane src_format = + anv_get_format_aspect(cmd_buffer->device->info, image->vk.format, + VK_IMAGE_ASPECT_COLOR_BIT, image->vk.tiling); + + unsigned base_layer = pRanges[r].baseArrayLayer; + uint32_t layer_count = + vk_image_subresource_layer_count(&image->vk, &pRanges[r]); + uint32_t level_count = + vk_image_subresource_level_count(&image->vk, &pRanges[r]); + + for (uint32_t i = 0; i < level_count; i++) { + const unsigned level = pRanges[r].baseMipLevel + i; + const unsigned level_width = anv_minify(image->vk.extent.width, level); + const unsigned level_height = anv_minify(image->vk.extent.height, level); + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(image->vk.extent.depth, level); + } + + anv_cmd_buffer_mark_image_written(cmd_buffer, image, + pRanges[r].aspectMask, + surf.aux_usage, level, + base_layer, layer_count); + + blorp_clear(&batch, &surf, + src_format.isl_format, src_format.swizzle, + level, base_layer, layer_count, + 0, 0, level_width, level_height, + 
vk_to_isl_color(*pColor), 0 /* color_write_disable */); + } + } + + anv_blorp_batch_finish(&batch); +} + +void anv_CmdClearDepthStencilImage( + VkCommandBuffer commandBuffer, + VkImage image_h, + VkImageLayout imageLayout, + const VkClearDepthStencilValue* pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange* pRanges) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, image, image_h); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct blorp_surf depth, stencil, stencil_shadow; + if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + imageLayout, ISL_AUX_USAGE_NONE, &depth); + } else { + memset(&depth, 0, sizeof(depth)); + } + + bool has_stencil_shadow = false; + if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_STENCIL_BIT, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + imageLayout, ISL_AUX_USAGE_NONE, &stencil); + + has_stencil_shadow = + get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + &stencil_shadow); + } else { + memset(&stencil, 0, sizeof(stencil)); + } + + for (unsigned r = 0; r < rangeCount; r++) { + if (pRanges[r].aspectMask == 0) + continue; + + bool clear_depth = pRanges[r].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT; + bool clear_stencil = pRanges[r].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT; + + unsigned base_layer = pRanges[r].baseArrayLayer; + uint32_t layer_count = + vk_image_subresource_layer_count(&image->vk, &pRanges[r]); + uint32_t level_count = + vk_image_subresource_level_count(&image->vk, &pRanges[r]); + + for (uint32_t i = 0; i < level_count; i++) { + const unsigned level = pRanges[r].baseMipLevel + i; + const unsigned level_width = anv_minify(image->vk.extent.width, level); + const unsigned level_height = anv_minify(image->vk.extent.height, level); + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) + layer_count = anv_minify(image->vk.extent.depth, level); + + blorp_clear_depth_stencil(&batch, &depth, &stencil, + level, base_layer, layer_count, + 0, 0, level_width, level_height, + clear_depth, pDepthStencil->depth, + clear_stencil ? 0xff : 0, + pDepthStencil->stencil); + + if (clear_stencil && has_stencil_shadow) { + union isl_color_value stencil_color = { + .u32 = { pDepthStencil->stencil, }, + }; + blorp_clear(&batch, &stencil_shadow, + ISL_FORMAT_R8_UINT, ISL_SWIZZLE_IDENTITY, + level, base_layer, layer_count, + 0, 0, level_width, level_height, + stencil_color, 0 /* color_write_disable */); + } + } + } + + anv_blorp_batch_finish(&batch); +} + +VkResult +anv_cmd_buffer_alloc_blorp_binding_table(struct anv_cmd_buffer *cmd_buffer, + uint32_t num_entries, + uint32_t *state_offset, + struct anv_state *bt_state) +{ + *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, num_entries, + state_offset); + if (bt_state->map == NULL) { + /* We ran out of space. Grab a new binding table block. */ + VkResult result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); + if (result != VK_SUCCESS) + return result; + + /* Re-emit state base addresses so we get the new surface state base + * address before we start emitting binding tables etc. 
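[Editor's note] The clear loops above size each mip level with anv_minify, i.e. the base extent shifted right by the level index and clamped to at least 1; 3D images additionally re-derive their layer count from the minified depth. A small worked example of that computation (local helper, not the driver's):

#include <assert.h>
#include <stdint.h>

/* Extent of a mip level: base >> level, but never smaller than 1. */
static uint32_t
minify(uint32_t base, uint32_t level)
{
   uint32_t v = base >> level;
   return v ? v : 1;
}

int
main(void)
{
   /* A 1024x768 image: level 4 is 64x48, and level 10 is clamped to 1x1
    * even though 768 >> 10 would be 0.
    */
   assert(minify(1024, 4) == 64);
   assert(minify(768, 4) == 48);
   assert(minify(1024, 10) == 1);
   assert(minify(768, 10) == 1);
   return 0;
}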
+ */ + anv_cmd_buffer_emit_state_base_address(cmd_buffer); + + *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, num_entries, + state_offset); + assert(bt_state->map != NULL); + } + + return VK_SUCCESS; +} + +static VkResult +binding_table_for_surface_state(struct anv_cmd_buffer *cmd_buffer, + struct anv_state surface_state, + uint32_t *bt_offset) +{ + uint32_t state_offset; + struct anv_state bt_state; + + VkResult result = + anv_cmd_buffer_alloc_blorp_binding_table(cmd_buffer, 1, &state_offset, + &bt_state); + if (result != VK_SUCCESS) + return result; + + uint32_t *bt_map = bt_state.map; + bt_map[0] = surface_state.offset + state_offset; + + *bt_offset = bt_state.offset; + return VK_SUCCESS; +} + +static void +clear_color_attachment(struct anv_cmd_buffer *cmd_buffer, + struct blorp_batch *batch, + const VkClearAttachment *attachment, + uint32_t rectCount, const VkClearRect *pRects) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const uint32_t att_idx = attachment->colorAttachment; + assert(att_idx < gfx->color_att_count); + const struct anv_attachment *att = &gfx->color_att[att_idx]; + + if (att->vk_format == VK_FORMAT_UNDEFINED) + return; + + uint32_t binding_table; + VkResult result = + binding_table_for_surface_state(cmd_buffer, att->surface_state.state, + &binding_table); + if (result != VK_SUCCESS) + return; + + union isl_color_value clear_color = + vk_to_isl_color(attachment->clearValue.color); + + /* If multiview is enabled we ignore baseArrayLayer and layerCount */ + if (gfx->view_mask) { + u_foreach_bit(view_idx, gfx->view_mask) { + for (uint32_t r = 0; r < rectCount; ++r) { + const VkOffset2D offset = pRects[r].rect.offset; + const VkExtent2D extent = pRects[r].rect.extent; + blorp_clear_attachments(batch, binding_table, + ISL_FORMAT_UNSUPPORTED, + gfx->samples, + view_idx, 1, + offset.x, offset.y, + offset.x + extent.width, + offset.y + extent.height, + true, clear_color, false, 0.0f, 0, 0); + } + } + return; + } + + for (uint32_t r = 0; r < rectCount; ++r) { + const VkOffset2D offset = pRects[r].rect.offset; + const VkExtent2D extent = pRects[r].rect.extent; + assert(pRects[r].layerCount != VK_REMAINING_ARRAY_LAYERS); + blorp_clear_attachments(batch, binding_table, + ISL_FORMAT_UNSUPPORTED, + gfx->samples, + pRects[r].baseArrayLayer, + pRects[r].layerCount, + offset.x, offset.y, + offset.x + extent.width, offset.y + extent.height, + true, clear_color, false, 0.0f, 0, 0); + } +} + +static void +clear_depth_stencil_attachment(struct anv_cmd_buffer *cmd_buffer, + struct blorp_batch *batch, + const VkClearAttachment *attachment, + uint32_t rectCount, const VkClearRect *pRects) +{ + static const union isl_color_value color_value = { .u32 = { 0, } }; + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct anv_attachment *d_att = &gfx->depth_att; + const struct anv_attachment *s_att = &gfx->stencil_att; + if (d_att->vk_format == VK_FORMAT_UNDEFINED && + s_att->vk_format == VK_FORMAT_UNDEFINED) + return; + + bool clear_depth = attachment->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT; + bool clear_stencil = attachment->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT; + + enum isl_format depth_format = ISL_FORMAT_UNSUPPORTED; + if (d_att->vk_format != VK_FORMAT_UNDEFINED) { + depth_format = anv_get_isl_format(cmd_buffer->device->info, + d_att->vk_format, + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_TILING_OPTIMAL); + } + + uint32_t binding_table; + VkResult result = + binding_table_for_surface_state(cmd_buffer, + gfx->null_surface_state, + 
&binding_table); + if (result != VK_SUCCESS) + return; + + /* If multiview is enabled we ignore baseArrayLayer and layerCount */ + if (gfx->view_mask) { + u_foreach_bit(view_idx, gfx->view_mask) { + for (uint32_t r = 0; r < rectCount; ++r) { + const VkOffset2D offset = pRects[r].rect.offset; + const VkExtent2D extent = pRects[r].rect.extent; + VkClearDepthStencilValue value = attachment->clearValue.depthStencil; + blorp_clear_attachments(batch, binding_table, + depth_format, + gfx->samples, + view_idx, 1, + offset.x, offset.y, + offset.x + extent.width, + offset.y + extent.height, + false, color_value, + clear_depth, value.depth, + clear_stencil ? 0xff : 0, value.stencil); + } + } + return; + } + + for (uint32_t r = 0; r < rectCount; ++r) { + const VkOffset2D offset = pRects[r].rect.offset; + const VkExtent2D extent = pRects[r].rect.extent; + VkClearDepthStencilValue value = attachment->clearValue.depthStencil; + assert(pRects[r].layerCount != VK_REMAINING_ARRAY_LAYERS); + blorp_clear_attachments(batch, binding_table, + depth_format, + gfx->samples, + pRects[r].baseArrayLayer, + pRects[r].layerCount, + offset.x, offset.y, + offset.x + extent.width, offset.y + extent.height, + false, color_value, + clear_depth, value.depth, + clear_stencil ? 0xff : 0, value.stencil); + } +} + +void anv_CmdClearAttachments( + VkCommandBuffer commandBuffer, + uint32_t attachmentCount, + const VkClearAttachment* pAttachments, + uint32_t rectCount, + const VkClearRect* pRects) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + /* Because this gets called within a render pass, we tell blorp not to + * trash our depth and stencil buffers. + */ + struct blorp_batch batch; + enum blorp_batch_flags flags = BLORP_BATCH_NO_EMIT_DEPTH_STENCIL; + if (cmd_buffer->state.conditional_render_enabled) { + anv_cmd_emit_conditional_render_predicate(cmd_buffer); + flags |= BLORP_BATCH_PREDICATE_ENABLE; + } + anv_blorp_batch_init(cmd_buffer, &batch, flags); + + for (uint32_t a = 0; a < attachmentCount; ++a) { + if (pAttachments[a].aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + assert(pAttachments[a].aspectMask == VK_IMAGE_ASPECT_COLOR_BIT); + clear_color_attachment(cmd_buffer, &batch, + &pAttachments[a], + rectCount, pRects); + } else { + clear_depth_stencil_attachment(cmd_buffer, &batch, + &pAttachments[a], + rectCount, pRects); + } + } + + anv_blorp_batch_finish(&batch); +} + +enum subpass_stage { + SUBPASS_STAGE_LOAD, + SUBPASS_STAGE_DRAW, + SUBPASS_STAGE_RESOLVE, +}; + +void +anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *src_image, + enum isl_aux_usage src_aux_usage, + uint32_t src_level, uint32_t src_base_layer, + const struct anv_image *dst_image, + enum isl_aux_usage dst_aux_usage, + uint32_t dst_level, uint32_t dst_base_layer, + VkImageAspectFlagBits aspect, + uint32_t src_x, uint32_t src_y, + uint32_t dst_x, uint32_t dst_y, + uint32_t width, uint32_t height, + uint32_t layer_count, + enum blorp_filter filter) +{ + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + assert(src_image->vk.image_type == VK_IMAGE_TYPE_2D); + assert(src_image->vk.samples > 1); + assert(dst_image->vk.image_type == VK_IMAGE_TYPE_2D); + assert(dst_image->vk.samples == 1); + + struct blorp_surf src_surf, dst_surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, src_image, aspect, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + src_aux_usage, &src_surf); + if (src_aux_usage == 
ISL_AUX_USAGE_MCS) { + src_surf.clear_color_addr = anv_to_blorp_address( + anv_image_get_clear_color_addr(cmd_buffer->device, src_image, + VK_IMAGE_ASPECT_COLOR_BIT)); + } + get_blorp_surf_for_anv_image(cmd_buffer->device, dst_image, aspect, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + dst_aux_usage, &dst_surf); + anv_cmd_buffer_mark_image_written(cmd_buffer, dst_image, + aspect, dst_aux_usage, + dst_level, dst_base_layer, layer_count); + + if (filter == BLORP_FILTER_NONE) { + /* If no explicit filter is provided, then it's implied by the type of + * the source image. + */ + if ((src_surf.surf->usage & ISL_SURF_USAGE_DEPTH_BIT) || + (src_surf.surf->usage & ISL_SURF_USAGE_STENCIL_BIT) || + isl_format_has_int_channel(src_surf.surf->format)) { + filter = BLORP_FILTER_SAMPLE_0; + } else { + filter = BLORP_FILTER_AVERAGE; + } + } + + for (uint32_t l = 0; l < layer_count; l++) { + blorp_blit(&batch, + &src_surf, src_level, src_base_layer + l, + ISL_FORMAT_UNSUPPORTED, ISL_SWIZZLE_IDENTITY, + &dst_surf, dst_level, dst_base_layer + l, + ISL_FORMAT_UNSUPPORTED, ISL_SWIZZLE_IDENTITY, + src_x, src_y, src_x + width, src_y + height, + dst_x, dst_y, dst_x + width, dst_y + height, + filter, false, false); + } + + anv_blorp_batch_finish(&batch); +} + +static void +resolve_image(struct anv_cmd_buffer *cmd_buffer, + struct anv_image *src_image, + VkImageLayout src_image_layout, + struct anv_image *dst_image, + VkImageLayout dst_image_layout, + const VkImageResolve2 *region) +{ + assert(region->srcSubresource.aspectMask == region->dstSubresource.aspectMask); + assert(vk_image_subresource_layer_count(&src_image->vk, &region->srcSubresource) == + vk_image_subresource_layer_count(&dst_image->vk, &region->dstSubresource)); + + const uint32_t layer_count = + vk_image_subresource_layer_count(&dst_image->vk, &region->dstSubresource); + + anv_foreach_image_aspect_bit(aspect_bit, src_image, + region->srcSubresource.aspectMask) { + enum isl_aux_usage src_aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, src_image, + (1 << aspect_bit), + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + src_image_layout); + enum isl_aux_usage dst_aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, dst_image, + (1 << aspect_bit), + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + dst_image_layout); + + anv_image_msaa_resolve(cmd_buffer, + src_image, src_aux_usage, + region->srcSubresource.mipLevel, + region->srcSubresource.baseArrayLayer, + dst_image, dst_aux_usage, + region->dstSubresource.mipLevel, + region->dstSubresource.baseArrayLayer, + (1 << aspect_bit), + region->srcOffset.x, + region->srcOffset.y, + region->dstOffset.x, + region->dstOffset.y, + region->extent.width, + region->extent.height, + layer_count, BLORP_FILTER_NONE); + } +} + +void anv_CmdResolveImage2( + VkCommandBuffer commandBuffer, + const VkResolveImageInfo2* pResolveImageInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_image, src_image, pResolveImageInfo->srcImage); + ANV_FROM_HANDLE(anv_image, dst_image, pResolveImageInfo->dstImage); + + for (uint32_t r = 0; r < pResolveImageInfo->regionCount; r++) { + resolve_image(cmd_buffer, + src_image, pResolveImageInfo->srcImageLayout, + dst_image, pResolveImageInfo->dstImageLayout, + &pResolveImageInfo->pRegions[r]); + } +} + +void +anv_image_copy_to_shadow(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count) +{ + struct blorp_batch
batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + /* We don't know who touched the main surface last so flush a bunch of + * caches to ensure we get good data. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, + "before copy_to_shadow"); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, aspect, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + VK_IMAGE_LAYOUT_GENERAL, + ISL_AUX_USAGE_NONE, &surf); + assert(surf.aux_usage == ISL_AUX_USAGE_NONE); + + struct blorp_surf shadow_surf; + get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, + image, aspect, &shadow_surf); + + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = base_level + l; + + const VkExtent3D extent = vk_image_mip_level_extent(&image->vk, level); + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) + layer_count = extent.depth; + + for (uint32_t a = 0; a < layer_count; a++) { + const uint32_t layer = base_layer + a; + + blorp_copy(&batch, &surf, level, layer, + &shadow_surf, level, layer, + 0, 0, 0, 0, extent.width, extent.height); + } + } + + /* We just wrote to the buffer with the render cache. Flush it. */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, + "after copy_to_shadow"); + + anv_blorp_batch_finish(&batch); +} + +void +anv_image_clear_color(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + enum isl_format format, struct isl_swizzle swizzle, + uint32_t level, uint32_t base_layer, uint32_t layer_count, + VkRect2D area, union isl_color_value clear_color) +{ + assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); + + /* We don't support planar images with multisampling yet */ + assert(image->n_planes == 1); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + aux_usage, &surf); + anv_cmd_buffer_mark_image_written(cmd_buffer, image, aspect, aux_usage, + level, base_layer, layer_count); + + blorp_clear(&batch, &surf, format, anv_swizzle_for_render(swizzle), + level, base_layer, layer_count, + area.offset.x, area.offset.y, + area.offset.x + area.extent.width, + area.offset.y + area.extent.height, + clear_color, 0 /* color_write_disable */); + + anv_blorp_batch_finish(&batch); +} + +void +anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlags aspects, + enum isl_aux_usage depth_aux_usage, + uint32_t level, + uint32_t base_layer, uint32_t layer_count, + VkRect2D area, + float depth_value, uint8_t stencil_value) +{ + assert(image->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT)); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct blorp_surf depth = {}; + if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + depth_aux_usage, &depth); + } + + struct blorp_surf stencil = {}; + if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); + 
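/* Unlike the depth surface above, which uses the caller-provided depth_aux_usage, the stencil surface is always set up with the aux usage recorded on its own plane. */ +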
get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_STENCIL_BIT, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + image->planes[plane].aux_usage, &stencil); + } + + /* Blorp may choose to clear stencil using RGBA32_UINT for better + * performance. If it does this, we need to flush it out of the depth + * cache before rendering to it. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before clear DS"); + + blorp_clear_depth_stencil(&batch, &depth, &stencil, + level, base_layer, layer_count, + area.offset.x, area.offset.y, + area.offset.x + area.extent.width, + area.offset.y + area.extent.height, + aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + depth_value, + (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) ? 0xff : 0, + stencil_value); + + /* Blorp may choose to clear stencil using RGBA32_UINT for better + * performance. If it does this, we need to flush it out of the render + * cache before someone starts trying to do stencil on it. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after clear DS"); + + struct blorp_surf stencil_shadow; + if ((aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && + get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + &stencil_shadow)) { + union isl_color_value stencil_color = { + .u32 = { stencil_value }, + }; + blorp_clear(&batch, &stencil_shadow, + ISL_FORMAT_R8_UINT, ISL_SWIZZLE_IDENTITY, + level, base_layer, layer_count, + area.offset.x, area.offset.y, + area.offset.x + area.extent.width, + area.offset.y + area.extent.height, + stencil_color, 0 /* color_write_disable */); + } + + anv_blorp_batch_finish(&batch); +} + +void +anv_image_hiz_op(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, uint32_t level, + uint32_t base_layer, uint32_t layer_count, + enum isl_aux_op hiz_op) +{ + assert(aspect == VK_IMAGE_ASPECT_DEPTH_BIT); + assert(base_layer + layer_count <= anv_image_aux_layers(image, aspect, level)); + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + assert(plane == 0); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + image->planes[plane].aux_usage, &surf); + + blorp_hiz_op(&batch, &surf, level, base_layer, layer_count, hiz_op); + + anv_blorp_batch_finish(&batch); +} + +void +anv_image_hiz_clear(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlags aspects, + uint32_t level, + uint32_t base_layer, uint32_t layer_count, + VkRect2D area, uint8_t stencil_value) +{ + assert(image->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT)); + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, 0); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct blorp_surf depth = {}; + if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT); + assert(base_layer + layer_count <= + anv_image_aux_layers(image, VK_IMAGE_ASPECT_DEPTH_BIT, level)); + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + image->planes[plane].aux_usage, &depth); + } + + struct blorp_surf stencil = {}; + if 
(aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); + get_blorp_surf_for_anv_image(cmd_buffer->device, + image, VK_IMAGE_ASPECT_STENCIL_BIT, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + image->planes[plane].aux_usage, &stencil); + } + + /* From the Sky Lake PRM Volume 7, "Depth Buffer Clear": + * + * "The following is required when performing a depth buffer clear with + * using the WM_STATE or 3DSTATE_WM: + * + * * If other rendering operations have preceded this clear, a + * PIPE_CONTROL with depth cache flush enabled, Depth Stall bit + * enabled must be issued before the rectangle primitive used for + * the depth buffer clear operation. + * * [...]" + * + * Even though the PRM only says that this is required if using 3DSTATE_WM + * and a 3DPRIMITIVE, the GPU appears to also need this to avoid occasional + * hangs when doing a clear with WM_HZ_OP. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT, + "before clear hiz"); + + if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && + depth.aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT) { + /* From Bspec 47010 (Depth Buffer Clear): + * + * Since the fast clear cycles to CCS are not cached in TileCache, + * any previous depth buffer writes to overlapping pixels must be + * flushed out of TileCache before a succeeding Depth Buffer Clear. + * This restriction only applies to Depth Buffer with write-thru + * enabled, since fast clears to CCS only occur for write-thru mode. + * + * There may have been a write to this depth buffer. Flush it from the + * tile cache just in case. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_TILE_CACHE_FLUSH_BIT, + "before clear hiz_ccs_wt"); + } + + blorp_hiz_clear_depth_stencil(&batch, &depth, &stencil, + level, base_layer, layer_count, + area.offset.x, area.offset.y, + area.offset.x + area.extent.width, + area.offset.y + area.extent.height, + aspects & VK_IMAGE_ASPECT_DEPTH_BIT, + ANV_HZ_FC_VAL, + aspects & VK_IMAGE_ASPECT_STENCIL_BIT, + stencil_value); + + anv_blorp_batch_finish(&batch); + + /* From the SKL PRM, Depth Buffer Clear: + * + * "Depth Buffer Clear Workaround + * + * Depth buffer clear pass using any of the methods (WM_STATE, + * 3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a PIPE_CONTROL + * command with DEPTH_STALL bit and Depth FLUSH bits “set” before + * starting to render. DepthStall and DepthFlush are not needed between + * consecutive depth clear passes nor is it required if the depth-clear + * pass was done with “full_surf_clear” bit set in the + * 3DSTATE_WM_HZ_OP." + * + * Even though the PRM provides a bunch of conditions under which this is + * supposedly unnecessary, we choose to perform the flush unconditionally + * just to be safe. 
+ */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT, + "after clear hiz"); +} + +void +anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + enum isl_format format, struct isl_swizzle swizzle, + VkImageAspectFlagBits aspect, + uint32_t base_layer, uint32_t layer_count, + enum isl_aux_op mcs_op, union isl_color_value *clear_value, + bool predicate) +{ + assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); + assert(image->vk.samples > 1); + assert(base_layer + layer_count <= anv_image_aux_layers(image, aspect, 0)); + + /* Multisampling with multi-planar formats is not supported */ + assert(image->n_planes == 1); + + const struct intel_device_info *devinfo = cmd_buffer->device->info; + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, + BLORP_BATCH_PREDICATE_ENABLE * predicate + + BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + ISL_AUX_USAGE_MCS, &surf); + + /* Blorp will store the clear color for us if we provide the clear color + * address and we are doing a fast clear. So we save the clear value into + * the blorp surface. + */ + if (clear_value) + surf.clear_color = *clear_value; + + /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear": + * + * "After Render target fast clear, pipe-control with color cache + * write-flush must be issued before sending any DRAW commands on + * that render target." + * + * This comment is a bit cryptic and doesn't really tell you what's going + * or what's really needed. It appears that fast clear ops are not + * properly synchronized with other drawing. This means that we cannot + * have a fast clear operation in the pipe at the same time as other + * regular drawing operations. We need to use a PIPE_CONTROL to ensure + * that the contents of the previous draw hit the render target before we + * resolve and then use a second PIPE_CONTROL after the resolve to ensure + * that it is completed before any additional drawing occurs. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_TILE_CACHE_FLUSH_BIT | + (devinfo->verx10 == 120 ? + ANV_PIPE_DEPTH_STALL_BIT : 0) | + (devinfo->verx10 == 125 ? + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | + ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0) | + ANV_PIPE_PSS_STALL_SYNC_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before fast clear mcs"); + + switch (mcs_op) { + case ISL_AUX_OP_FAST_CLEAR: + blorp_fast_clear(&batch, &surf, format, swizzle, + 0, base_layer, layer_count, + 0, 0, image->vk.extent.width, image->vk.extent.height); + break; + case ISL_AUX_OP_PARTIAL_RESOLVE: + blorp_mcs_partial_resolve(&batch, &surf, format, + base_layer, layer_count); + break; + case ISL_AUX_OP_FULL_RESOLVE: + case ISL_AUX_OP_AMBIGUATE: + default: + unreachable("Unsupported MCS operation"); + } + + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + (devinfo->verx10 == 120 ? 
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT : 0) | + ANV_PIPE_PSS_STALL_SYNC_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after fast clear mcs"); + + anv_blorp_batch_finish(&batch); +} + +void +anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + enum isl_format format, struct isl_swizzle swizzle, + VkImageAspectFlagBits aspect, uint32_t level, + uint32_t base_layer, uint32_t layer_count, + enum isl_aux_op ccs_op, union isl_color_value *clear_value, + bool predicate) +{ + assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + assert(image->vk.samples == 1); + assert(level < anv_image_aux_levels(image, aspect)); + /* Multi-LOD YcBcR is not allowed */ + assert(image->n_planes == 1 || level == 0); + assert(base_layer + layer_count <= + anv_image_aux_layers(image, aspect, level)); + + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + const struct intel_device_info *devinfo = cmd_buffer->device->info; + + struct blorp_batch batch; + anv_blorp_batch_init(cmd_buffer, &batch, + BLORP_BATCH_PREDICATE_ENABLE * predicate + + BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value); + assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct blorp_surf surf; + get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect, + 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX, + image->planes[plane].aux_usage, + &surf); + + uint32_t level_width = anv_minify(surf.surf->logical_level0_px.w, level); + uint32_t level_height = anv_minify(surf.surf->logical_level0_px.h, level); + + /* Blorp will store the clear color for us if we provide the clear color + * address and we are doing a fast clear. So we save the clear value into + * the blorp surface. + */ + if (clear_value) + surf.clear_color = *clear_value; + + /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear": + * + * "After Render target fast clear, pipe-control with color cache + * write-flush must be issued before sending any DRAW commands on + * that render target." + * + * This comment is a bit cryptic and doesn't really tell you what's going + * or what's really needed. It appears that fast clear ops are not + * properly synchronized with other drawing. This means that we cannot + * have a fast clear operation in the pipe at the same time as other + * regular drawing operations. We need to use a PIPE_CONTROL to ensure + * that the contents of the previous draw hit the render target before we + * resolve and then use a second PIPE_CONTROL after the resolve to ensure + * that it is completed before any additional drawing occurs. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_TILE_CACHE_FLUSH_BIT | + (devinfo->verx10 == 120 ? + ANV_PIPE_DEPTH_STALL_BIT : 0) | + (devinfo->verx10 == 125 ? 
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | + ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0) | + ANV_PIPE_PSS_STALL_SYNC_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before fast clear ccs"); + + switch (ccs_op) { + case ISL_AUX_OP_FAST_CLEAR: + blorp_fast_clear(&batch, &surf, format, swizzle, + level, base_layer, layer_count, + 0, 0, level_width, level_height); + break; + case ISL_AUX_OP_FULL_RESOLVE: + case ISL_AUX_OP_PARTIAL_RESOLVE: + blorp_ccs_resolve(&batch, &surf, level, base_layer, layer_count, + format, ccs_op); + break; + case ISL_AUX_OP_AMBIGUATE: + for (uint32_t a = 0; a < layer_count; a++) { + const uint32_t layer = base_layer + a; + blorp_ccs_ambiguate(&batch, &surf, level, layer); + } + break; + default: + unreachable("Unsupported CCS operation"); + } + + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + (devinfo->verx10 == 120 ? + ANV_PIPE_TILE_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT : 0) | + ANV_PIPE_PSS_STALL_SYNC_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after fast clear ccs"); + + anv_blorp_batch_finish(&batch); +} diff --git a/src/intel/vulkan_hasvk/anv_bo_sync.c b/src/intel/vulkan_hasvk/anv_bo_sync.c new file mode 100644 index 00000000000..149ae2c2ba2 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_bo_sync.c @@ -0,0 +1,237 @@ +/* + * Copyright © 2021 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" + +#include "util/os_time.h" + +static struct anv_bo_sync * +to_anv_bo_sync(struct vk_sync *sync) +{ + assert(sync->type == &anv_bo_sync_type); + return container_of(sync, struct anv_bo_sync, sync); +} + +static VkResult +anv_bo_sync_init(struct vk_device *vk_device, + struct vk_sync *vk_sync, + uint64_t initial_value) +{ + struct anv_device *device = container_of(vk_device, struct anv_device, vk); + struct anv_bo_sync *sync = to_anv_bo_sync(vk_sync); + + sync->state = initial_value ? 
ANV_BO_SYNC_STATE_SIGNALED : + ANV_BO_SYNC_STATE_RESET; + + return anv_device_alloc_bo(device, "bo-sync", 4096, + ANV_BO_ALLOC_EXTERNAL | + ANV_BO_ALLOC_IMPLICIT_SYNC, + 0 /* explicit_address */, + &sync->bo); +} + +static void +anv_bo_sync_finish(struct vk_device *vk_device, + struct vk_sync *vk_sync) +{ + struct anv_device *device = container_of(vk_device, struct anv_device, vk); + struct anv_bo_sync *sync = to_anv_bo_sync(vk_sync); + + anv_device_release_bo(device, sync->bo); +} + +static VkResult +anv_bo_sync_reset(struct vk_device *vk_device, + struct vk_sync *vk_sync) +{ + struct anv_bo_sync *sync = to_anv_bo_sync(vk_sync); + + sync->state = ANV_BO_SYNC_STATE_RESET; + + return VK_SUCCESS; +} + +static int64_t +anv_get_relative_timeout(uint64_t abs_timeout) +{ + uint64_t now = os_time_get_nano(); + + /* We don't want negative timeouts. + * + * DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is + * supposed to block indefinitely timeouts < 0. Unfortunately, + * this was broken for a couple of kernel releases. Since there's + * no way to know whether or not the kernel we're using is one of + * the broken ones, the best we can do is to clamp the timeout to + * INT64_MAX. This limits the maximum timeout from 584 years to + * 292 years - likely not a big deal. + */ + if (abs_timeout < now) + return 0; + + uint64_t rel_timeout = abs_timeout - now; + if (rel_timeout > (uint64_t) INT64_MAX) + rel_timeout = INT64_MAX; + + return rel_timeout; +} + +static VkResult +anv_bo_sync_wait(struct vk_device *vk_device, + uint32_t wait_count, + const struct vk_sync_wait *waits, + enum vk_sync_wait_flags wait_flags, + uint64_t abs_timeout_ns) +{ + struct anv_device *device = container_of(vk_device, struct anv_device, vk); + VkResult result; + + uint32_t pending = wait_count; + while (pending) { + pending = 0; + bool signaled = false; + for (uint32_t i = 0; i < wait_count; i++) { + struct anv_bo_sync *sync = to_anv_bo_sync(waits[i].sync); + switch (sync->state) { + case ANV_BO_SYNC_STATE_RESET: + /* This fence hasn't been submitted yet, we'll catch it the next + * time around. Yes, this may mean we dead-loop but, short of + * lots of locking and a condition variable, there's not much that + * we can do about that. + */ + assert(!(wait_flags & VK_SYNC_WAIT_PENDING)); + pending++; + continue; + + case ANV_BO_SYNC_STATE_SIGNALED: + /* This fence is not pending. If waitAll isn't set, we can return + * early. Otherwise, we have to keep going. + */ + if (wait_flags & VK_SYNC_WAIT_ANY) + return VK_SUCCESS; + continue; + + case ANV_BO_SYNC_STATE_SUBMITTED: + /* These are the fences we really care about. Go ahead and wait + * on it until we hit a timeout. + */ + if (!(wait_flags & VK_SYNC_WAIT_PENDING)) { + uint64_t rel_timeout = anv_get_relative_timeout(abs_timeout_ns); + result = anv_device_wait(device, sync->bo, rel_timeout); + /* This also covers VK_TIMEOUT */ + if (result != VK_SUCCESS) + return result; + + sync->state = ANV_BO_SYNC_STATE_SIGNALED; + signaled = true; + } + if (wait_flags & VK_SYNC_WAIT_ANY) + return VK_SUCCESS; + break; + + default: + unreachable("Invalid BO sync state"); + } + } + + if (pending && !signaled) { + /* If we've hit this then someone decided to vkWaitForFences before + * they've actually submitted any of them to a queue. This is a + * fairly pessimal case, so it's ok to lock here and use a standard + * pthreads condition variable. + */ + pthread_mutex_lock(&device->mutex); + + /* It's possible that some of the fences have changed state since the + * last time we checked. 
Now that we have the lock, check for + * pending fences again and don't wait if it's changed. + */ + uint32_t now_pending = 0; + for (uint32_t i = 0; i < wait_count; i++) { + struct anv_bo_sync *sync = to_anv_bo_sync(waits[i].sync); + if (sync->state == ANV_BO_SYNC_STATE_RESET) + now_pending++; + } + assert(now_pending <= pending); + + if (now_pending == pending) { + struct timespec abstime = { + .tv_sec = abs_timeout_ns / NSEC_PER_SEC, + .tv_nsec = abs_timeout_ns % NSEC_PER_SEC, + }; + + ASSERTED int ret; + ret = pthread_cond_timedwait(&device->queue_submit, + &device->mutex, &abstime); + assert(ret != EINVAL); + if (os_time_get_nano() >= abs_timeout_ns) { + pthread_mutex_unlock(&device->mutex); + return VK_TIMEOUT; + } + } + + pthread_mutex_unlock(&device->mutex); + } + } + + return VK_SUCCESS; +} + +const struct vk_sync_type anv_bo_sync_type = { + .size = sizeof(struct anv_bo_sync), + .features = VK_SYNC_FEATURE_BINARY | + VK_SYNC_FEATURE_GPU_WAIT | + VK_SYNC_FEATURE_GPU_MULTI_WAIT | + VK_SYNC_FEATURE_CPU_WAIT | + VK_SYNC_FEATURE_CPU_RESET | + VK_SYNC_FEATURE_WAIT_ANY | + VK_SYNC_FEATURE_WAIT_PENDING, + .init = anv_bo_sync_init, + .finish = anv_bo_sync_finish, + .reset = anv_bo_sync_reset, + .wait_many = anv_bo_sync_wait, +}; + +VkResult +anv_create_sync_for_memory(struct vk_device *device, + VkDeviceMemory memory, + bool signal_memory, + struct vk_sync **sync_out) +{ + ANV_FROM_HANDLE(anv_device_memory, mem, memory); + struct anv_bo_sync *bo_sync; + + bo_sync = vk_zalloc(&device->alloc, sizeof(*bo_sync), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (bo_sync == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + bo_sync->sync.type = &anv_bo_sync_type; + bo_sync->state = signal_memory ? ANV_BO_SYNC_STATE_RESET : + ANV_BO_SYNC_STATE_SUBMITTED; + bo_sync->bo = anv_bo_ref(mem->bo); + + *sync_out = &bo_sync->sync; + + return VK_SUCCESS; +} diff --git a/src/intel/vulkan_hasvk/anv_cmd_buffer.c b/src/intel/vulkan_hasvk/anv_cmd_buffer.c new file mode 100644 index 00000000000..0950bad52a6 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_cmd_buffer.c @@ -0,0 +1,1112 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "anv_private.h" +#include "anv_measure.h" + +#include "vk_util.h" + +/** \file anv_cmd_buffer.c + * + * This file contains all of the stuff for emitting commands into a command + * buffer. 
This includes implementations of most of the vkCmd* + * entrypoints. This file is concerned entirely with state emission and + * not with the command buffer data structure itself. As far as this file + * is concerned, most of anv_cmd_buffer is magic. + */ + +static void +anv_cmd_state_init(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_state *state = &cmd_buffer->state; + + memset(state, 0, sizeof(*state)); + + state->current_pipeline = UINT32_MAX; + state->gfx.restart_index = UINT32_MAX; + state->gfx.dirty = 0; +} + +static void +anv_cmd_pipeline_state_finish(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipe_state) +{ + for (uint32_t i = 0; i < ARRAY_SIZE(pipe_state->push_descriptors); i++) { + if (pipe_state->push_descriptors[i]) { + anv_descriptor_set_layout_unref(cmd_buffer->device, + pipe_state->push_descriptors[i]->set.layout); + vk_free(&cmd_buffer->vk.pool->alloc, pipe_state->push_descriptors[i]); + } + } +} + +static void +anv_cmd_state_finish(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_state *state = &cmd_buffer->state; + + anv_cmd_pipeline_state_finish(cmd_buffer, &state->gfx.base); + anv_cmd_pipeline_state_finish(cmd_buffer, &state->compute.base); +} + +static void +anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer) +{ + anv_cmd_state_finish(cmd_buffer); + anv_cmd_state_init(cmd_buffer); +} + +static void anv_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer); + +static const struct vk_command_buffer_ops cmd_buffer_ops = { + .destroy = anv_cmd_buffer_destroy, +}; + +static VkResult anv_create_cmd_buffer( + struct anv_device * device, + struct vk_command_pool * pool, + VkCommandBufferLevel level, + VkCommandBuffer* pCommandBuffer) +{ + struct anv_cmd_buffer *cmd_buffer; + VkResult result; + + cmd_buffer = vk_alloc(&pool->alloc, sizeof(*cmd_buffer), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (cmd_buffer == NULL) + return vk_error(pool, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = vk_command_buffer_init(pool, &cmd_buffer->vk, + &cmd_buffer_ops, level); + if (result != VK_SUCCESS) + goto fail_alloc; + + cmd_buffer->vk.dynamic_graphics_state.ms.sample_locations = + &cmd_buffer->state.gfx.sample_locations; + + cmd_buffer->batch.status = VK_SUCCESS; + + cmd_buffer->device = device; + + assert(pool->queue_family_index < device->physical->queue.family_count); + cmd_buffer->queue_family = + &device->physical->queue.families[pool->queue_family_index]; + + result = anv_cmd_buffer_init_batch_bo_chain(cmd_buffer); + if (result != VK_SUCCESS) + goto fail_vk; + + anv_state_stream_init(&cmd_buffer->surface_state_stream, + &device->surface_state_pool, 4096); + anv_state_stream_init(&cmd_buffer->dynamic_state_stream, + &device->dynamic_state_pool, 16384); + anv_state_stream_init(&cmd_buffer->general_state_stream, + &device->general_state_pool, 16384); + + cmd_buffer->self_mod_locations = NULL; + + anv_cmd_state_init(cmd_buffer); + + anv_measure_init(cmd_buffer); + + u_trace_init(&cmd_buffer->trace, &device->ds.trace_context); + + *pCommandBuffer = anv_cmd_buffer_to_handle(cmd_buffer); + + return VK_SUCCESS; + + fail_vk: + vk_command_buffer_finish(&cmd_buffer->vk); + fail_alloc: + vk_free2(&device->vk.alloc, &pool->alloc, cmd_buffer); + + return result; +} + +VkResult anv_AllocateCommandBuffers( + VkDevice _device, + const VkCommandBufferAllocateInfo* pAllocateInfo, + VkCommandBuffer* pCommandBuffers) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool); + + VkResult result = 
VK_SUCCESS; + uint32_t i; + + for (i = 0; i < pAllocateInfo->commandBufferCount; i++) { + result = anv_create_cmd_buffer(device, pool, pAllocateInfo->level, + &pCommandBuffers[i]); + if (result != VK_SUCCESS) + break; + } + + if (result != VK_SUCCESS) { + while (i--) { + VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]); + anv_cmd_buffer_destroy(cmd_buffer); + } + for (i = 0; i < pAllocateInfo->commandBufferCount; i++) + pCommandBuffers[i] = VK_NULL_HANDLE; + } + + return result; +} + +static void +anv_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer) +{ + struct anv_cmd_buffer *cmd_buffer = + container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk); + + u_trace_fini(&cmd_buffer->trace); + + anv_measure_destroy(cmd_buffer); + + anv_cmd_buffer_fini_batch_bo_chain(cmd_buffer); + + anv_state_stream_finish(&cmd_buffer->surface_state_stream); + anv_state_stream_finish(&cmd_buffer->dynamic_state_stream); + anv_state_stream_finish(&cmd_buffer->general_state_stream); + + anv_cmd_state_finish(cmd_buffer); + + vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->self_mod_locations); + + vk_command_buffer_finish(&cmd_buffer->vk); + vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer); +} + +VkResult +anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer) +{ + vk_command_buffer_reset(&cmd_buffer->vk); + + cmd_buffer->usage_flags = 0; + cmd_buffer->perf_query_pool = NULL; + anv_cmd_buffer_reset_batch_bo_chain(cmd_buffer); + anv_cmd_state_reset(cmd_buffer); + + anv_state_stream_finish(&cmd_buffer->surface_state_stream); + anv_state_stream_init(&cmd_buffer->surface_state_stream, + &cmd_buffer->device->surface_state_pool, 4096); + + anv_state_stream_finish(&cmd_buffer->dynamic_state_stream); + anv_state_stream_init(&cmd_buffer->dynamic_state_stream, + &cmd_buffer->device->dynamic_state_pool, 16384); + + anv_state_stream_finish(&cmd_buffer->general_state_stream); + anv_state_stream_init(&cmd_buffer->general_state_stream, + &cmd_buffer->device->general_state_pool, 16384); + + anv_measure_reset(cmd_buffer); + + u_trace_fini(&cmd_buffer->trace); + u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->ds.trace_context); + + return VK_SUCCESS; +} + +VkResult anv_ResetCommandBuffer( + VkCommandBuffer commandBuffer, + VkCommandBufferResetFlags flags) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + return anv_cmd_buffer_reset(cmd_buffer); +} + +void +anv_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer) +{ + const struct intel_device_info *devinfo = cmd_buffer->device->info; + anv_genX(devinfo, cmd_buffer_emit_state_base_address)(cmd_buffer); +} + +void +anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + uint32_t level, + uint32_t base_layer, + uint32_t layer_count) +{ + const struct intel_device_info *devinfo = cmd_buffer->device->info; + anv_genX(devinfo, cmd_buffer_mark_image_written)(cmd_buffer, image, + aspect, aux_usage, + level, base_layer, + layer_count); +} + +void +anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer) +{ + const struct intel_device_info *devinfo = cmd_buffer->device->info; + anv_genX(devinfo, cmd_emit_conditional_render_predicate)(cmd_buffer); +} + +static bool +mem_update(void *dst, const void *src, size_t size) +{ + if (memcmp(dst, src, size) == 0) + return false; + + memcpy(dst, src, size); + return true; +} + +static void +set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer, + 
gl_shader_stage stage, + const struct anv_pipeline_bind_map *map) +{ + assert(stage < ARRAY_SIZE(cmd_buffer->state.surface_sha1s)); + if (mem_update(cmd_buffer->state.surface_sha1s[stage], + map->surface_sha1, sizeof(map->surface_sha1))) + cmd_buffer->state.descriptors_dirty |= mesa_to_vk_shader_stage(stage); + + assert(stage < ARRAY_SIZE(cmd_buffer->state.sampler_sha1s)); + if (mem_update(cmd_buffer->state.sampler_sha1s[stage], + map->sampler_sha1, sizeof(map->sampler_sha1))) + cmd_buffer->state.descriptors_dirty |= mesa_to_vk_shader_stage(stage); + + assert(stage < ARRAY_SIZE(cmd_buffer->state.push_sha1s)); + if (mem_update(cmd_buffer->state.push_sha1s[stage], + map->push_sha1, sizeof(map->push_sha1))) + cmd_buffer->state.push_constants_dirty |= mesa_to_vk_shader_stage(stage); +} + +static inline uint32_t +ilog2_round_up(uint32_t value) +{ + assert(value != 0); + return 32 - __builtin_clz(value - 1); +} + +static void +anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipeline_state, + struct anv_pipeline *pipeline, + VkShaderStageFlags stages) +{ + struct anv_device *device = cmd_buffer->device; + + uint64_t ray_shadow_size = + align_u64(brw_rt_ray_queries_shadow_stacks_size(device->info, + pipeline->ray_queries), + 4096); + if (ray_shadow_size > 0 && + (!cmd_buffer->state.ray_query_shadow_bo || + cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) { + unsigned shadow_size_log2 = MAX2(ilog2_round_up(ray_shadow_size), 16); + unsigned bucket = shadow_size_log2 - 16; + assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos)); + + struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[bucket]); + if (bo == NULL) { + struct anv_bo *new_bo; + VkResult result = anv_device_alloc_bo(device, "RT queries shadow", + ray_shadow_size, + 0, /* alloc_flags */ + 0, /* explicit_address */ + &new_bo); + if (result != VK_SUCCESS) { + anv_batch_set_error(&cmd_buffer->batch, result); + return; + } + + bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[bucket], NULL, new_bo); + if (bo != NULL) { + anv_device_release_bo(device, bo); + } else { + bo = new_bo; + } + } + cmd_buffer->state.ray_query_shadow_bo = bo; + + /* Add the ray query buffers to the batch list. */ + anv_reloc_list_add_bo(cmd_buffer->batch.relocs, + cmd_buffer->batch.alloc, + cmd_buffer->state.ray_query_shadow_bo); + } + + /* Add the HW buffer to the list of BO used. */ + anv_reloc_list_add_bo(cmd_buffer->batch.relocs, + cmd_buffer->batch.alloc, + device->ray_query_bo); + + /* Fill the push constants & mark them dirty. 
*/ + struct anv_state ray_query_global_state = + anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer); + + struct anv_address ray_query_globals_addr = (struct anv_address) { + .bo = device->dynamic_state_pool.block_pool.bo, + .offset = ray_query_global_state.offset, + }; + pipeline_state->push_constants.ray_query_globals = + anv_address_physical(ray_query_globals_addr); + cmd_buffer->state.push_constants_dirty |= stages; +} + +void anv_CmdBindPipeline( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipeline _pipeline) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline); + struct anv_cmd_pipeline_state *state; + VkShaderStageFlags stages = 0; + + switch (pipelineBindPoint) { + case VK_PIPELINE_BIND_POINT_COMPUTE: { + struct anv_compute_pipeline *compute_pipeline = + anv_pipeline_to_compute(pipeline); + if (cmd_buffer->state.compute.pipeline == compute_pipeline) + return; + + cmd_buffer->state.compute.pipeline = compute_pipeline; + cmd_buffer->state.compute.pipeline_dirty = true; + set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE, + &compute_pipeline->cs->bind_map); + + state = &cmd_buffer->state.compute.base; + stages = VK_SHADER_STAGE_COMPUTE_BIT; + break; + } + + case VK_PIPELINE_BIND_POINT_GRAPHICS: { + struct anv_graphics_pipeline *gfx_pipeline = + anv_pipeline_to_graphics(pipeline); + if (cmd_buffer->state.gfx.pipeline == gfx_pipeline) + return; + + cmd_buffer->state.gfx.pipeline = gfx_pipeline; + cmd_buffer->state.gfx.vb_dirty |= gfx_pipeline->vb_used; + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; + + anv_foreach_stage(stage, gfx_pipeline->active_stages) { + set_dirty_for_bind_map(cmd_buffer, stage, + &gfx_pipeline->shaders[stage]->bind_map); + } + + /* Apply the non dynamic state from the pipeline */ + vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk, + &gfx_pipeline->dynamic_state); + + state = &cmd_buffer->state.gfx.base; + stages = gfx_pipeline->active_stages; + break; + } + + case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: { + struct anv_ray_tracing_pipeline *rt_pipeline = + anv_pipeline_to_ray_tracing(pipeline); + if (cmd_buffer->state.rt.pipeline == rt_pipeline) + return; + + cmd_buffer->state.rt.pipeline = rt_pipeline; + cmd_buffer->state.rt.pipeline_dirty = true; + + if (rt_pipeline->stack_size > 0) { + anv_CmdSetRayTracingPipelineStackSizeKHR(commandBuffer, + rt_pipeline->stack_size); + } + + state = &cmd_buffer->state.rt.base; + break; + } + + default: + unreachable("invalid bind point"); + break; + } + + if (pipeline->ray_queries > 0) + anv_cmd_buffer_set_ray_query_buffer(cmd_buffer, state, pipeline, stages); +} + +static void +anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer, + VkPipelineBindPoint bind_point, + struct anv_pipeline_layout *layout, + uint32_t set_index, + struct anv_descriptor_set *set, + uint32_t *dynamic_offset_count, + const uint32_t **dynamic_offsets) +{ + /* Either we have no pool because it's a push descriptor or the pool is not + * host only : + * + * VUID-vkCmdBindDescriptorSets-pDescriptorSets-04616: + * + * "Each element of pDescriptorSets must not have been allocated from a + * VkDescriptorPool with the + * VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_VALVE flag set" + */ + assert(!set->pool || !set->pool->host_only); + + struct anv_descriptor_set_layout *set_layout = + layout->set[set_index].layout; + + VkShaderStageFlags stages = set_layout->shader_stages; + struct anv_cmd_pipeline_state *pipe_state; + + 
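/* Narrow the layout's shader stages to the ones reachable from this bind point and pick the matching per-bind-point pipeline state. */ +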
switch (bind_point) { + case VK_PIPELINE_BIND_POINT_GRAPHICS: + stages &= VK_SHADER_STAGE_ALL_GRAPHICS | + (cmd_buffer->device->vk.enabled_extensions.NV_mesh_shader ? + (VK_SHADER_STAGE_TASK_BIT_NV | + VK_SHADER_STAGE_MESH_BIT_NV) : 0); + pipe_state = &cmd_buffer->state.gfx.base; + break; + + case VK_PIPELINE_BIND_POINT_COMPUTE: + stages &= VK_SHADER_STAGE_COMPUTE_BIT; + pipe_state = &cmd_buffer->state.compute.base; + break; + + case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: + stages &= VK_SHADER_STAGE_RAYGEN_BIT_KHR | + VK_SHADER_STAGE_ANY_HIT_BIT_KHR | + VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | + VK_SHADER_STAGE_MISS_BIT_KHR | + VK_SHADER_STAGE_INTERSECTION_BIT_KHR | + VK_SHADER_STAGE_CALLABLE_BIT_KHR; + pipe_state = &cmd_buffer->state.rt.base; + break; + + default: + unreachable("invalid bind point"); + } + + VkShaderStageFlags dirty_stages = 0; + /* If it's a push descriptor set, we have to flag things as dirty + * regardless of whether or not the CPU-side data structure changed as we + * may have edited in-place. + */ + if (pipe_state->descriptors[set_index] != set || + anv_descriptor_set_is_push(set)) { + pipe_state->descriptors[set_index] = set; + + /* Those stages don't have access to HW binding tables. + * This means that we have to upload the descriptor set + * as an 64-bit address in the push constants. + */ + bool update_desc_sets = stages & (VK_SHADER_STAGE_TASK_BIT_NV | + VK_SHADER_STAGE_MESH_BIT_NV | + VK_SHADER_STAGE_RAYGEN_BIT_KHR | + VK_SHADER_STAGE_ANY_HIT_BIT_KHR | + VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | + VK_SHADER_STAGE_MISS_BIT_KHR | + VK_SHADER_STAGE_INTERSECTION_BIT_KHR | + VK_SHADER_STAGE_CALLABLE_BIT_KHR); + + if (update_desc_sets) { + struct anv_push_constants *push = &pipe_state->push_constants; + + struct anv_address addr = anv_descriptor_set_address(set); + push->desc_sets[set_index] = anv_address_physical(addr); + + if (addr.bo) { + anv_reloc_list_add_bo(cmd_buffer->batch.relocs, + cmd_buffer->batch.alloc, + addr.bo); + } + } + + dirty_stages |= stages; + } + + if (dynamic_offsets) { + if (set_layout->dynamic_offset_count > 0) { + struct anv_push_constants *push = &pipe_state->push_constants; + uint32_t dynamic_offset_start = + layout->set[set_index].dynamic_offset_start; + uint32_t *push_offsets = + &push->dynamic_offsets[dynamic_offset_start]; + + /* Assert that everything is in range */ + assert(set_layout->dynamic_offset_count <= *dynamic_offset_count); + assert(dynamic_offset_start + set_layout->dynamic_offset_count <= + ARRAY_SIZE(push->dynamic_offsets)); + + for (uint32_t i = 0; i < set_layout->dynamic_offset_count; i++) { + if (push_offsets[i] != (*dynamic_offsets)[i]) { + push_offsets[i] = (*dynamic_offsets)[i]; + /* dynamic_offset_stages[] elements could contain blanket + * values like VK_SHADER_STAGE_ALL, so limit this to the + * binding point's bits. 
+ */ + dirty_stages |= set_layout->dynamic_offset_stages[i] & stages; + } + } + + *dynamic_offsets += set_layout->dynamic_offset_count; + *dynamic_offset_count -= set_layout->dynamic_offset_count; + } + } + + cmd_buffer->state.descriptors_dirty |= dirty_stages; + cmd_buffer->state.push_constants_dirty |= dirty_stages; +} + +void anv_CmdBindDescriptorSets( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t firstSet, + uint32_t descriptorSetCount, + const VkDescriptorSet* pDescriptorSets, + uint32_t dynamicOffsetCount, + const uint32_t* pDynamicOffsets) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout); + + assert(firstSet + descriptorSetCount <= MAX_SETS); + + for (uint32_t i = 0; i < descriptorSetCount; i++) { + ANV_FROM_HANDLE(anv_descriptor_set, set, pDescriptorSets[i]); + anv_cmd_buffer_bind_descriptor_set(cmd_buffer, pipelineBindPoint, + layout, firstSet + i, set, + &dynamicOffsetCount, + &pDynamicOffsets); + } +} + +void anv_CmdBindVertexBuffers2( + VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer* pBuffers, + const VkDeviceSize* pOffsets, + const VkDeviceSize* pSizes, + const VkDeviceSize* pStrides) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_vertex_binding *vb = cmd_buffer->state.vertex_bindings; + + /* We have to defer setting up vertex buffer since we need the buffer + * stride from the pipeline. */ + + assert(firstBinding + bindingCount <= MAX_VBS); + for (uint32_t i = 0; i < bindingCount; i++) { + ANV_FROM_HANDLE(anv_buffer, buffer, pBuffers[i]); + + if (buffer == NULL) { + vb[firstBinding + i] = (struct anv_vertex_binding) { + .buffer = NULL, + }; + } else { + vb[firstBinding + i] = (struct anv_vertex_binding) { + .buffer = buffer, + .offset = pOffsets[i], + .size = vk_buffer_range(&buffer->vk, pOffsets[i], + pSizes ? pSizes[i] : VK_WHOLE_SIZE), + }; + } + cmd_buffer->state.gfx.vb_dirty |= 1 << (firstBinding + i); + } + + if (pStrides != NULL) { + vk_cmd_set_vertex_binding_strides(&cmd_buffer->vk, firstBinding, + bindingCount, pStrides); + } +} + +void anv_CmdBindTransformFeedbackBuffersEXT( + VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer* pBuffers, + const VkDeviceSize* pOffsets, + const VkDeviceSize* pSizes) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_xfb_binding *xfb = cmd_buffer->state.xfb_bindings; + + /* We have to defer setting up vertex buffer since we need the buffer + * stride from the pipeline. */ + + assert(firstBinding + bindingCount <= MAX_XFB_BUFFERS); + for (uint32_t i = 0; i < bindingCount; i++) { + if (pBuffers[i] == VK_NULL_HANDLE) { + xfb[firstBinding + i].buffer = NULL; + } else { + ANV_FROM_HANDLE(anv_buffer, buffer, pBuffers[i]); + xfb[firstBinding + i].buffer = buffer; + xfb[firstBinding + i].offset = pOffsets[i]; + xfb[firstBinding + i].size = + vk_buffer_range(&buffer->vk, pOffsets[i], + pSizes ? pSizes[i] : VK_WHOLE_SIZE); + } + } +} + +enum isl_format +anv_isl_format_for_descriptor_type(const struct anv_device *device, + VkDescriptorType type) +{ + switch (type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + return device->physical->compiler->indirect_ubos_use_sampler ? 
+ ISL_FORMAT_R32G32B32A32_FLOAT : ISL_FORMAT_RAW; + + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + return ISL_FORMAT_RAW; + + default: + unreachable("Invalid descriptor type"); + } +} + +struct anv_state +anv_cmd_buffer_emit_dynamic(struct anv_cmd_buffer *cmd_buffer, + const void *data, uint32_t size, uint32_t alignment) +{ + struct anv_state state; + + state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, alignment); + memcpy(state.map, data, size); + + VG(VALGRIND_CHECK_MEM_IS_DEFINED(state.map, size)); + + return state; +} + +struct anv_state +anv_cmd_buffer_merge_dynamic(struct anv_cmd_buffer *cmd_buffer, + uint32_t *a, uint32_t *b, + uint32_t dwords, uint32_t alignment) +{ + struct anv_state state; + uint32_t *p; + + state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + dwords * 4, alignment); + p = state.map; + for (uint32_t i = 0; i < dwords; i++) + p[i] = a[i] | b[i]; + + VG(VALGRIND_CHECK_MEM_IS_DEFINED(p, dwords * 4)); + + return state; +} + +struct anv_state +anv_cmd_buffer_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_push_constants *data = + &cmd_buffer->state.gfx.base.push_constants; + + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + sizeof(struct anv_push_constants), + 32 /* bottom 5 bits MBZ */); + memcpy(state.map, data, sizeof(struct anv_push_constants)); + + return state; +} + +struct anv_state +anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) +{ + const struct intel_device_info *devinfo = cmd_buffer->device->info; + struct anv_push_constants *data = + &cmd_buffer->state.compute.base.push_constants; + struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline; + const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); + const struct anv_push_range *range = &pipeline->cs->bind_map.push_ranges[0]; + + const struct brw_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL); + const unsigned total_push_constants_size = + brw_cs_push_const_total_size(cs_prog_data, dispatch.threads); + if (total_push_constants_size == 0) + return (struct anv_state) { .offset = 0 }; + + const unsigned push_constant_alignment = + cmd_buffer->device->info->ver < 8 ? 
32 : 64; + const unsigned aligned_total_push_constants_size = + ALIGN(total_push_constants_size, push_constant_alignment); + struct anv_state state; + if (devinfo->verx10 >= 125) { + state = anv_state_stream_alloc(&cmd_buffer->general_state_stream, + aligned_total_push_constants_size, + push_constant_alignment); + } else { + state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + aligned_total_push_constants_size, + push_constant_alignment); + } + + void *dst = state.map; + const void *src = (char *)data + (range->start * 32); + + if (cs_prog_data->push.cross_thread.size > 0) { + memcpy(dst, src, cs_prog_data->push.cross_thread.size); + dst += cs_prog_data->push.cross_thread.size; + src += cs_prog_data->push.cross_thread.size; + } + + if (cs_prog_data->push.per_thread.size > 0) { + for (unsigned t = 0; t < dispatch.threads; t++) { + memcpy(dst, src, cs_prog_data->push.per_thread.size); + + uint32_t *subgroup_id = dst + + offsetof(struct anv_push_constants, cs.subgroup_id) - + (range->start * 32 + cs_prog_data->push.cross_thread.size); + *subgroup_id = t; + + dst += cs_prog_data->push.per_thread.size; + } + } + + return state; +} + +void anv_CmdPushConstants( + VkCommandBuffer commandBuffer, + VkPipelineLayout layout, + VkShaderStageFlags stageFlags, + uint32_t offset, + uint32_t size, + const void* pValues) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + if (stageFlags & (VK_SHADER_STAGE_ALL_GRAPHICS | + VK_SHADER_STAGE_TASK_BIT_NV | + VK_SHADER_STAGE_MESH_BIT_NV)) { + struct anv_cmd_pipeline_state *pipe_state = + &cmd_buffer->state.gfx.base; + + memcpy(pipe_state->push_constants.client_data + offset, pValues, size); + } + if (stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) { + struct anv_cmd_pipeline_state *pipe_state = + &cmd_buffer->state.compute.base; + + memcpy(pipe_state->push_constants.client_data + offset, pValues, size); + } + if (stageFlags & (VK_SHADER_STAGE_RAYGEN_BIT_KHR | + VK_SHADER_STAGE_ANY_HIT_BIT_KHR | + VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | + VK_SHADER_STAGE_MISS_BIT_KHR | + VK_SHADER_STAGE_INTERSECTION_BIT_KHR | + VK_SHADER_STAGE_CALLABLE_BIT_KHR)) { + struct anv_cmd_pipeline_state *pipe_state = + &cmd_buffer->state.rt.base; + + memcpy(pipe_state->push_constants.client_data + offset, pValues, size); + } + + cmd_buffer->state.push_constants_dirty |= stageFlags; +} + +static struct anv_descriptor_set * +anv_cmd_buffer_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer, + VkPipelineBindPoint bind_point, + struct anv_descriptor_set_layout *layout, + uint32_t _set) +{ + struct anv_cmd_pipeline_state *pipe_state; + + switch (bind_point) { + case VK_PIPELINE_BIND_POINT_GRAPHICS: + pipe_state = &cmd_buffer->state.gfx.base; + break; + + case VK_PIPELINE_BIND_POINT_COMPUTE: + pipe_state = &cmd_buffer->state.compute.base; + break; + + case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: + pipe_state = &cmd_buffer->state.rt.base; + break; + + default: + unreachable("invalid bind point"); + } + + struct anv_push_descriptor_set **push_set = + &pipe_state->push_descriptors[_set]; + + if (*push_set == NULL) { + *push_set = vk_zalloc(&cmd_buffer->vk.pool->alloc, + sizeof(struct anv_push_descriptor_set), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (*push_set == NULL) { + anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); + return NULL; + } + } + + struct anv_descriptor_set *set = &(*push_set)->set; + + if (set->layout != layout) { + if (set->layout) + anv_descriptor_set_layout_unref(cmd_buffer->device, set->layout); + 
anv_descriptor_set_layout_ref(layout); + set->layout = layout; + } + set->size = anv_descriptor_set_layout_size(layout, 0); + set->buffer_view_count = layout->buffer_view_count; + set->descriptor_count = layout->descriptor_count; + set->buffer_views = (*push_set)->buffer_views; + + if (layout->descriptor_buffer_size && + ((*push_set)->set_used_on_gpu || + set->desc_mem.alloc_size < layout->descriptor_buffer_size)) { + /* The previous buffer is either actively used by some GPU command (so + * we can't modify it) or is too small. Allocate a new one. + */ + struct anv_state desc_mem = + anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream, + anv_descriptor_set_layout_descriptor_buffer_size(layout, 0), + ANV_UBO_ALIGNMENT); + if (set->desc_mem.alloc_size) { + /* TODO: Do we really need to copy all the time? */ + memcpy(desc_mem.map, set->desc_mem.map, + MIN2(desc_mem.alloc_size, set->desc_mem.alloc_size)); + } + set->desc_mem = desc_mem; + + set->desc_addr = (struct anv_address) { + .bo = cmd_buffer->dynamic_state_stream.state_pool->block_pool.bo, + .offset = set->desc_mem.offset, + }; + + enum isl_format format = + anv_isl_format_for_descriptor_type(cmd_buffer->device, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); + + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + set->desc_surface_state = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, + isl_dev->ss.size, isl_dev->ss.align); + anv_fill_buffer_surface_state(cmd_buffer->device, + set->desc_surface_state, + format, ISL_SWIZZLE_IDENTITY, + ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, + set->desc_addr, + layout->descriptor_buffer_size, 1); + } + + return set; +} + +void anv_CmdPushDescriptorSetKHR( + VkCommandBuffer commandBuffer, + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout _layout, + uint32_t _set, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout); + + assert(_set < MAX_SETS); + + struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout; + + struct anv_descriptor_set *set = + anv_cmd_buffer_push_descriptor_set(cmd_buffer, pipelineBindPoint, + set_layout, _set); + if (!set) + return; + + /* Go through the user supplied descriptors. 
*/ + for (uint32_t i = 0; i < descriptorWriteCount; i++) { + const VkWriteDescriptorSet *write = &pDescriptorWrites[i]; + + switch (write->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + anv_descriptor_set_write_image_view(cmd_buffer->device, set, + write->pImageInfo + j, + write->descriptorType, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + ANV_FROM_HANDLE(anv_buffer_view, bview, + write->pTexelBufferView[j]); + + anv_descriptor_set_write_buffer_view(cmd_buffer->device, set, + write->descriptorType, + bview, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + ANV_FROM_HANDLE(anv_buffer, buffer, write->pBufferInfo[j].buffer); + + anv_descriptor_set_write_buffer(cmd_buffer->device, set, + &cmd_buffer->surface_state_stream, + write->descriptorType, + buffer, + write->dstBinding, + write->dstArrayElement + j, + write->pBufferInfo[j].offset, + write->pBufferInfo[j].range); + } + break; + + case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: { + const VkWriteDescriptorSetAccelerationStructureKHR *accel_write = + vk_find_struct_const(write, WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR); + assert(accel_write->accelerationStructureCount == + write->descriptorCount); + for (uint32_t j = 0; j < write->descriptorCount; j++) { + ANV_FROM_HANDLE(anv_acceleration_structure, accel, + accel_write->pAccelerationStructures[j]); + anv_descriptor_set_write_acceleration_structure(cmd_buffer->device, + set, accel, + write->dstBinding, + write->dstArrayElement + j); + } + break; + } + + default: + break; + } + } + + anv_cmd_buffer_bind_descriptor_set(cmd_buffer, pipelineBindPoint, + layout, _set, set, NULL, NULL); +} + +void anv_CmdPushDescriptorSetWithTemplateKHR( + VkCommandBuffer commandBuffer, + VkDescriptorUpdateTemplate descriptorUpdateTemplate, + VkPipelineLayout _layout, + uint32_t _set, + const void* pData) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_descriptor_update_template, template, + descriptorUpdateTemplate); + ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout); + + assert(_set < MAX_PUSH_DESCRIPTORS); + + struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout; + + struct anv_descriptor_set *set = + anv_cmd_buffer_push_descriptor_set(cmd_buffer, template->bind_point, + set_layout, _set); + if (!set) + return; + + anv_descriptor_set_write_template(cmd_buffer->device, set, + &cmd_buffer->surface_state_stream, + template, + pData); + + anv_cmd_buffer_bind_descriptor_set(cmd_buffer, template->bind_point, + layout, _set, set, NULL, NULL); +} + +void anv_CmdSetDeviceMask( + VkCommandBuffer commandBuffer, + uint32_t deviceMask) +{ + /* No-op */ +} + +void anv_CmdSetRayTracingPipelineStackSizeKHR( + VkCommandBuffer commandBuffer, + uint32_t pipelineStackSize) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_cmd_ray_tracing_state *rt = 
&cmd_buffer->state.rt; + struct anv_device *device = cmd_buffer->device; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + uint32_t stack_ids_per_dss = 2048; /* TODO */ + + unsigned stack_size_log2 = ilog2_round_up(pipelineStackSize); + if (stack_size_log2 < 10) + stack_size_log2 = 10; + + if (rt->scratch.layout.total_size == 1 << stack_size_log2) + return; + + brw_rt_compute_scratch_layout(&rt->scratch.layout, device->info, + stack_ids_per_dss, 1 << stack_size_log2); + + unsigned bucket = stack_size_log2 - 10; + assert(bucket < ARRAY_SIZE(device->rt_scratch_bos)); + + struct anv_bo *bo = p_atomic_read(&device->rt_scratch_bos[bucket]); + if (bo == NULL) { + struct anv_bo *new_bo; + VkResult result = anv_device_alloc_bo(device, "RT scratch", + rt->scratch.layout.total_size, + 0, /* alloc_flags */ + 0, /* explicit_address */ + &new_bo); + if (result != VK_SUCCESS) { + rt->scratch.layout.total_size = 0; + anv_batch_set_error(&cmd_buffer->batch, result); + return; + } + + bo = p_atomic_cmpxchg(&device->rt_scratch_bos[bucket], NULL, new_bo); + if (bo != NULL) { + anv_device_release_bo(device, bo); + } else { + bo = new_bo; + } + } + + rt->scratch.bo = bo; +} diff --git a/src/intel/vulkan_hasvk/anv_descriptor_set.c b/src/intel/vulkan_hasvk/anv_descriptor_set.c new file mode 100644 index 00000000000..c8fe93a9fbd --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_descriptor_set.c @@ -0,0 +1,2046 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "util/mesa-sha1.h" +#include "vk_util.h" + +#include "anv_private.h" + +/* + * Descriptor set layouts. 
+ */ + +static enum anv_descriptor_data +anv_descriptor_data_for_type(const struct anv_physical_device *device, + VkDescriptorType type) +{ + enum anv_descriptor_data data = 0; + + switch (type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + data = ANV_DESCRIPTOR_SAMPLER_STATE; + if (device->has_bindless_samplers) + data |= ANV_DESCRIPTOR_SAMPLED_IMAGE; + break; + + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + data = ANV_DESCRIPTOR_SURFACE_STATE | + ANV_DESCRIPTOR_SAMPLER_STATE; + if (device->has_bindless_images || device->has_bindless_samplers) + data |= ANV_DESCRIPTOR_SAMPLED_IMAGE; + break; + + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + data = ANV_DESCRIPTOR_SURFACE_STATE; + if (device->has_bindless_images) + data |= ANV_DESCRIPTOR_SAMPLED_IMAGE; + break; + + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + data = ANV_DESCRIPTOR_SURFACE_STATE; + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + data = ANV_DESCRIPTOR_SURFACE_STATE; + if (device->info.ver < 9) + data |= ANV_DESCRIPTOR_IMAGE_PARAM; + if (device->has_bindless_images) + data |= ANV_DESCRIPTOR_STORAGE_IMAGE; + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + data = ANV_DESCRIPTOR_SURFACE_STATE | + ANV_DESCRIPTOR_BUFFER_VIEW; + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + data = ANV_DESCRIPTOR_SURFACE_STATE; + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + data = ANV_DESCRIPTOR_INLINE_UNIFORM; + break; + + case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: + data = ANV_DESCRIPTOR_ADDRESS_RANGE; + break; + + default: + unreachable("Unsupported descriptor type"); + } + + /* On gfx8 and above when we have softpin enabled, we also need to push + * SSBO address ranges so that we can use A64 messages in the shader. + */ + if (device->has_a64_buffer_access && + (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER || + type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC || + type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || + type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)) + data |= ANV_DESCRIPTOR_ADDRESS_RANGE; + + /* On Ivy Bridge and Bay Trail, we need swizzles textures in the shader + * Do not handle VK_DESCRIPTOR_TYPE_STORAGE_IMAGE and + * VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT because they already must + * have identity swizzle. + * + * TODO: We need to handle swizzle on buffer views too for those same + * platforms. 
+ */ + if (device->info.verx10 == 70 && + (type == VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE || + type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)) + data |= ANV_DESCRIPTOR_TEXTURE_SWIZZLE; + + return data; +} + +static enum anv_descriptor_data +anv_descriptor_data_for_mutable_type(const struct anv_physical_device *device, + const VkMutableDescriptorTypeCreateInfoVALVE *mutable_info, + int binding) +{ + enum anv_descriptor_data desc_data = 0; + + if (!mutable_info || mutable_info->mutableDescriptorTypeListCount == 0) { + for(VkDescriptorType i = 0; i <= VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; i++) { + if (i == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC || + i == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || + i == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) + continue; + + desc_data |= anv_descriptor_data_for_type(device, i); + } + + desc_data |= anv_descriptor_data_for_type( + device, VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR); + + return desc_data; + } + + const VkMutableDescriptorTypeListVALVE *type_list = + &mutable_info->pMutableDescriptorTypeLists[binding]; + for (uint32_t i = 0; i < type_list->descriptorTypeCount; i++) { + desc_data |= + anv_descriptor_data_for_type(device, type_list->pDescriptorTypes[i]); + } + + return desc_data; +} + +static unsigned +anv_descriptor_data_size(enum anv_descriptor_data data) +{ + unsigned size = 0; + + if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE) + size += sizeof(struct anv_sampled_image_descriptor); + + if (data & ANV_DESCRIPTOR_STORAGE_IMAGE) + size += sizeof(struct anv_storage_image_descriptor); + + if (data & ANV_DESCRIPTOR_IMAGE_PARAM) + size += BRW_IMAGE_PARAM_SIZE * 4; + + if (data & ANV_DESCRIPTOR_ADDRESS_RANGE) + size += sizeof(struct anv_address_range_descriptor); + + if (data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE) + size += sizeof(struct anv_texture_swizzle_descriptor); + + return size; +} + +static bool +anv_needs_descriptor_buffer(VkDescriptorType desc_type, + enum anv_descriptor_data desc_data) +{ + if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK || + anv_descriptor_data_size(desc_data) > 0) + return true; + return false; +} + +/** Returns the size in bytes of each descriptor with the given layout */ +static unsigned +anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout) +{ + if (layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) { + assert(layout->data == ANV_DESCRIPTOR_INLINE_UNIFORM); + return layout->array_size; + } + + unsigned size = anv_descriptor_data_size(layout->data); + + /* For multi-planar bindings, we make every descriptor consume the maximum + * number of planes so we don't have to bother with walking arrays and + * adding things up every time. Fortunately, YCbCr samplers aren't all + * that common and likely won't be in the middle of big arrays. 
+ */ + if (layout->max_plane_count > 1) + size *= layout->max_plane_count; + + return size; +} + +/** Returns size in bytes of the biggest descriptor in the given layout */ +static unsigned +anv_descriptor_size_for_mutable_type(const struct anv_physical_device *device, + const VkMutableDescriptorTypeCreateInfoVALVE *mutable_info, + int binding) +{ + unsigned size = 0; + + if (!mutable_info || mutable_info->mutableDescriptorTypeListCount == 0) { + for(VkDescriptorType i = 0; i <= VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; i++) { + + if (i == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC || + i == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || + i == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) + continue; + + enum anv_descriptor_data desc_data = + anv_descriptor_data_for_type(device, i); + size = MAX2(size, anv_descriptor_data_size(desc_data)); + } + + enum anv_descriptor_data desc_data = anv_descriptor_data_for_type( + device, VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR); + size = MAX2(size, anv_descriptor_data_size(desc_data)); + + return size; + } + + const VkMutableDescriptorTypeListVALVE *type_list = + &mutable_info->pMutableDescriptorTypeLists[binding]; + for (uint32_t i = 0; i < type_list->descriptorTypeCount; i++) { + enum anv_descriptor_data desc_data = + anv_descriptor_data_for_type(device, type_list->pDescriptorTypes[i]); + size = MAX2(size, anv_descriptor_data_size(desc_data)); + } + + return size; +} + +static bool +anv_descriptor_data_supports_bindless(const struct anv_physical_device *pdevice, + enum anv_descriptor_data data, + bool sampler) +{ + if (data & ANV_DESCRIPTOR_ADDRESS_RANGE) { + assert(pdevice->has_a64_buffer_access); + return true; + } + + if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE) { + assert(pdevice->has_bindless_images || pdevice->has_bindless_samplers); + return sampler ? 
pdevice->has_bindless_samplers : + pdevice->has_bindless_images; + } + + if (data & ANV_DESCRIPTOR_STORAGE_IMAGE) { + assert(pdevice->has_bindless_images); + return true; + } + + return false; +} + +bool +anv_descriptor_supports_bindless(const struct anv_physical_device *pdevice, + const struct anv_descriptor_set_binding_layout *binding, + bool sampler) +{ + return anv_descriptor_data_supports_bindless(pdevice, binding->data, + sampler); +} + +bool +anv_descriptor_requires_bindless(const struct anv_physical_device *pdevice, + const struct anv_descriptor_set_binding_layout *binding, + bool sampler) +{ + if (pdevice->always_use_bindless) + return anv_descriptor_supports_bindless(pdevice, binding, sampler); + + static const VkDescriptorBindingFlagBits flags_requiring_bindless = + VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT | + VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT | + VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT; + + return (binding->flags & flags_requiring_bindless) != 0; +} + +void anv_GetDescriptorSetLayoutSupport( + VkDevice _device, + const VkDescriptorSetLayoutCreateInfo* pCreateInfo, + VkDescriptorSetLayoutSupport* pSupport) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + const struct anv_physical_device *pdevice = device->physical; + + uint32_t surface_count[MESA_VULKAN_SHADER_STAGES] = { 0, }; + VkDescriptorType varying_desc_type = VK_DESCRIPTOR_TYPE_MAX_ENUM; + bool needs_descriptor_buffer = false; + + const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags_info = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); + const VkMutableDescriptorTypeCreateInfoVALVE *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_VALVE); + + for (uint32_t b = 0; b < pCreateInfo->bindingCount; b++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[b]; + + VkDescriptorBindingFlags flags = 0; + if (binding_flags_info && binding_flags_info->bindingCount > 0) { + assert(binding_flags_info->bindingCount == pCreateInfo->bindingCount); + flags = binding_flags_info->pBindingFlags[b]; + } + + enum anv_descriptor_data desc_data = + binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? 
+ anv_descriptor_data_for_mutable_type(pdevice, mutable_info, b) : + anv_descriptor_data_for_type(pdevice, binding->descriptorType); + + if (anv_needs_descriptor_buffer(binding->descriptorType, desc_data)) + needs_descriptor_buffer = true; + + if (flags & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) + varying_desc_type = binding->descriptorType; + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + /* There is no real limit on samplers */ + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + /* Inline uniforms don't use a binding */ + break; + + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + if (anv_descriptor_data_supports_bindless(pdevice, desc_data, false)) + break; + + if (binding->pImmutableSamplers) { + for (uint32_t i = 0; i < binding->descriptorCount; i++) { + ANV_FROM_HANDLE(anv_sampler, sampler, + binding->pImmutableSamplers[i]); + anv_foreach_stage(s, binding->stageFlags) + surface_count[s] += sampler->n_planes; + } + } else { + anv_foreach_stage(s, binding->stageFlags) + surface_count[s] += binding->descriptorCount; + } + break; + + default: + if (anv_descriptor_data_supports_bindless(pdevice, desc_data, false)) + break; + + anv_foreach_stage(s, binding->stageFlags) + surface_count[s] += binding->descriptorCount; + break; + } + } + + for (unsigned s = 0; s < ARRAY_SIZE(surface_count); s++) { + if (needs_descriptor_buffer) + surface_count[s] += 1; + } + + VkDescriptorSetVariableDescriptorCountLayoutSupport *vdcls = + vk_find_struct(pSupport->pNext, + DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT); + if (vdcls != NULL) { + if (varying_desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + vdcls->maxVariableDescriptorCount = MAX_INLINE_UNIFORM_BLOCK_SIZE; + } else if (varying_desc_type != VK_DESCRIPTOR_TYPE_MAX_ENUM) { + vdcls->maxVariableDescriptorCount = UINT16_MAX; + } else { + vdcls->maxVariableDescriptorCount = 0; + } + } + + bool supported = true; + for (unsigned s = 0; s < ARRAY_SIZE(surface_count); s++) { + /* Our maximum binding table size is 240 and we need to reserve 8 for + * render targets. + */ + if (surface_count[s] > MAX_BINDING_TABLE_SIZE - MAX_RTS) + supported = false; + } + + pSupport->supported = supported; +} + +VkResult anv_CreateDescriptorSetLayout( + VkDevice _device, + const VkDescriptorSetLayoutCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDescriptorSetLayout* pSetLayout) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO); + + uint32_t num_bindings = 0; + uint32_t immutable_sampler_count = 0; + for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { + num_bindings = MAX2(num_bindings, pCreateInfo->pBindings[j].binding + 1); + + /* From the Vulkan 1.1.97 spec for VkDescriptorSetLayoutBinding: + * + * "If descriptorType specifies a VK_DESCRIPTOR_TYPE_SAMPLER or + * VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER type descriptor, then + * pImmutableSamplers can be used to initialize a set of immutable + * samplers. [...] If descriptorType is not one of these descriptor + * types, then pImmutableSamplers is ignored. + * + * We need to be careful here and only parse pImmutableSamplers if we + * have one of the right descriptor types. 
+ */ + VkDescriptorType desc_type = pCreateInfo->pBindings[j].descriptorType; + if ((desc_type == VK_DESCRIPTOR_TYPE_SAMPLER || + desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) && + pCreateInfo->pBindings[j].pImmutableSamplers) + immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount; + } + + /* We need to allocate descriptor set layouts off the device allocator + * with DEVICE scope because they are reference counted and may not be + * destroyed when vkDestroyDescriptorSetLayout is called. + */ + VK_MULTIALLOC(ma); + VK_MULTIALLOC_DECL(&ma, struct anv_descriptor_set_layout, set_layout, 1); + VK_MULTIALLOC_DECL(&ma, struct anv_descriptor_set_binding_layout, + bindings, num_bindings); + VK_MULTIALLOC_DECL(&ma, struct anv_sampler *, samplers, + immutable_sampler_count); + + if (!vk_object_multizalloc(&device->vk, &ma, NULL, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + set_layout->ref_cnt = 1; + set_layout->binding_count = num_bindings; + + for (uint32_t b = 0; b < num_bindings; b++) { + /* Initialize all binding_layout entries to -1 */ + memset(&set_layout->binding[b], -1, sizeof(set_layout->binding[b])); + + set_layout->binding[b].flags = 0; + set_layout->binding[b].data = 0; + set_layout->binding[b].max_plane_count = 0; + set_layout->binding[b].array_size = 0; + set_layout->binding[b].immutable_samplers = NULL; + } + + /* Initialize all samplers to 0 */ + memset(samplers, 0, immutable_sampler_count * sizeof(*samplers)); + + uint32_t buffer_view_count = 0; + uint32_t dynamic_offset_count = 0; + uint32_t descriptor_buffer_size = 0; + + for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { + const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j]; + uint32_t b = binding->binding; + /* We temporarily store pCreateInfo->pBindings[] index (plus one) in the + * immutable_samplers pointer. This provides us with a quick-and-dirty + * way to sort the bindings by binding number. + */ + set_layout->binding[b].immutable_samplers = (void *)(uintptr_t)(j + 1); + } + + const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags_info = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO); + + const VkMutableDescriptorTypeCreateInfoVALVE *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_VALVE); + + for (uint32_t b = 0; b < num_bindings; b++) { + /* We stashed the pCreateInfo->pBindings[] index (plus one) in the + * immutable_samplers pointer. Check for NULL (empty binding) and then + * reset it and compute the index. 
+ */ + if (set_layout->binding[b].immutable_samplers == NULL) + continue; + const uint32_t info_idx = + (uintptr_t)(void *)set_layout->binding[b].immutable_samplers - 1; + set_layout->binding[b].immutable_samplers = NULL; + + const VkDescriptorSetLayoutBinding *binding = + &pCreateInfo->pBindings[info_idx]; + + if (binding->descriptorCount == 0) + continue; + + set_layout->binding[b].type = binding->descriptorType; + + if (binding_flags_info && binding_flags_info->bindingCount > 0) { + assert(binding_flags_info->bindingCount == pCreateInfo->bindingCount); + set_layout->binding[b].flags = + binding_flags_info->pBindingFlags[info_idx]; + + /* From the Vulkan spec: + * + * "If VkDescriptorSetLayoutCreateInfo::flags includes + * VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, then + * all elements of pBindingFlags must not include + * VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT, + * VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT, or + * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT" + */ + if (pCreateInfo->flags & + VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) { + assert(!(set_layout->binding[b].flags & + (VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT | + VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT | + VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT))); + } + } + + set_layout->binding[b].data = + binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? + anv_descriptor_data_for_mutable_type(device->physical, mutable_info, b) : + anv_descriptor_data_for_type(device->physical, binding->descriptorType); + + set_layout->binding[b].array_size = binding->descriptorCount; + set_layout->binding[b].descriptor_index = set_layout->descriptor_count; + set_layout->descriptor_count += binding->descriptorCount; + + if (set_layout->binding[b].data & ANV_DESCRIPTOR_BUFFER_VIEW) { + set_layout->binding[b].buffer_view_index = buffer_view_count; + buffer_view_count += binding->descriptorCount; + } + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE: + set_layout->binding[b].max_plane_count = 1; + if (binding->pImmutableSamplers) { + set_layout->binding[b].immutable_samplers = samplers; + samplers += binding->descriptorCount; + + for (uint32_t i = 0; i < binding->descriptorCount; i++) { + ANV_FROM_HANDLE(anv_sampler, sampler, + binding->pImmutableSamplers[i]); + + set_layout->binding[b].immutable_samplers[i] = sampler; + if (set_layout->binding[b].max_plane_count < sampler->n_planes) + set_layout->binding[b].max_plane_count = sampler->n_planes; + } + } + break; + + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + set_layout->binding[b].max_plane_count = 1; + break; + + default: + break; + } + + switch (binding->descriptorType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + set_layout->binding[b].dynamic_offset_index = dynamic_offset_count; + set_layout->dynamic_offset_stages[dynamic_offset_count] = binding->stageFlags; + dynamic_offset_count += binding->descriptorCount; + assert(dynamic_offset_count < MAX_DYNAMIC_BUFFERS); + break; + + default: + break; + } + + set_layout->binding[b].descriptor_stride = + binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? 
+ anv_descriptor_size_for_mutable_type(device->physical, mutable_info, b) : + anv_descriptor_size(&set_layout->binding[b]); + + if (binding->descriptorType == + VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + /* Inline uniform blocks are specified to use the descriptor array + * size as the size in bytes of the block. + */ + descriptor_buffer_size = align_u32(descriptor_buffer_size, + ANV_UBO_ALIGNMENT); + set_layout->binding[b].descriptor_offset = descriptor_buffer_size; + descriptor_buffer_size += binding->descriptorCount; + } else { + set_layout->binding[b].descriptor_offset = descriptor_buffer_size; + descriptor_buffer_size += + set_layout->binding[b].descriptor_stride * binding->descriptorCount; + } + + set_layout->shader_stages |= binding->stageFlags; + } + + set_layout->buffer_view_count = buffer_view_count; + set_layout->dynamic_offset_count = dynamic_offset_count; + set_layout->descriptor_buffer_size = descriptor_buffer_size; + + *pSetLayout = anv_descriptor_set_layout_to_handle(set_layout); + + return VK_SUCCESS; +} + +void +anv_descriptor_set_layout_destroy(struct anv_device *device, + struct anv_descriptor_set_layout *layout) +{ + assert(layout->ref_cnt == 0); + vk_object_free(&device->vk, NULL, layout); +} + +static const struct anv_descriptor_set_binding_layout * +set_layout_dynamic_binding(const struct anv_descriptor_set_layout *set_layout) +{ + if (set_layout->binding_count == 0) + return NULL; + + const struct anv_descriptor_set_binding_layout *last_binding = + &set_layout->binding[set_layout->binding_count - 1]; + if (!(last_binding->flags & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) + return NULL; + + return last_binding; +} + +static uint32_t +set_layout_descriptor_count(const struct anv_descriptor_set_layout *set_layout, + uint32_t var_desc_count) +{ + const struct anv_descriptor_set_binding_layout *dynamic_binding = + set_layout_dynamic_binding(set_layout); + if (dynamic_binding == NULL) + return set_layout->descriptor_count; + + assert(var_desc_count <= dynamic_binding->array_size); + uint32_t shrink = dynamic_binding->array_size - var_desc_count; + + if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) + return set_layout->descriptor_count; + + return set_layout->descriptor_count - shrink; +} + +static uint32_t +set_layout_buffer_view_count(const struct anv_descriptor_set_layout *set_layout, + uint32_t var_desc_count) +{ + const struct anv_descriptor_set_binding_layout *dynamic_binding = + set_layout_dynamic_binding(set_layout); + if (dynamic_binding == NULL) + return set_layout->buffer_view_count; + + assert(var_desc_count <= dynamic_binding->array_size); + uint32_t shrink = dynamic_binding->array_size - var_desc_count; + + if (!(dynamic_binding->data & ANV_DESCRIPTOR_BUFFER_VIEW)) + return set_layout->buffer_view_count; + + return set_layout->buffer_view_count - shrink; +} + +uint32_t +anv_descriptor_set_layout_descriptor_buffer_size(const struct anv_descriptor_set_layout *set_layout, + uint32_t var_desc_count) +{ + const struct anv_descriptor_set_binding_layout *dynamic_binding = + set_layout_dynamic_binding(set_layout); + if (dynamic_binding == NULL) + return ALIGN(set_layout->descriptor_buffer_size, ANV_UBO_ALIGNMENT); + + assert(var_desc_count <= dynamic_binding->array_size); + uint32_t shrink = dynamic_binding->array_size - var_desc_count; + uint32_t set_size; + + if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + /* Inline uniform blocks are specified to use the descriptor array + * size as the size in bytes of 
the block. + */ + set_size = set_layout->descriptor_buffer_size - shrink; + } else { + set_size = set_layout->descriptor_buffer_size - + shrink * dynamic_binding->descriptor_stride; + } + + return ALIGN(set_size, ANV_UBO_ALIGNMENT); +} + +void anv_DestroyDescriptorSetLayout( + VkDevice _device, + VkDescriptorSetLayout _set_layout, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout, _set_layout); + + if (!set_layout) + return; + + anv_descriptor_set_layout_unref(device, set_layout); +} + +#define SHA1_UPDATE_VALUE(ctx, x) _mesa_sha1_update(ctx, &(x), sizeof(x)); + +static void +sha1_update_immutable_sampler(struct mesa_sha1 *ctx, + const struct anv_sampler *sampler) +{ + if (!sampler->conversion) + return; + + /* The only thing that affects the shader is ycbcr conversion */ + _mesa_sha1_update(ctx, sampler->conversion, + sizeof(*sampler->conversion)); +} + +static void +sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx, + const struct anv_descriptor_set_binding_layout *layout) +{ + SHA1_UPDATE_VALUE(ctx, layout->flags); + SHA1_UPDATE_VALUE(ctx, layout->data); + SHA1_UPDATE_VALUE(ctx, layout->max_plane_count); + SHA1_UPDATE_VALUE(ctx, layout->array_size); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_index); + SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_index); + SHA1_UPDATE_VALUE(ctx, layout->buffer_view_index); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_offset); + + if (layout->immutable_samplers) { + for (uint16_t i = 0; i < layout->array_size; i++) + sha1_update_immutable_sampler(ctx, layout->immutable_samplers[i]); + } +} + +static void +sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx, + const struct anv_descriptor_set_layout *layout) +{ + SHA1_UPDATE_VALUE(ctx, layout->binding_count); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_count); + SHA1_UPDATE_VALUE(ctx, layout->shader_stages); + SHA1_UPDATE_VALUE(ctx, layout->buffer_view_count); + SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count); + SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_size); + + for (uint16_t i = 0; i < layout->binding_count; i++) + sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i]); +} + +/* + * Pipeline layouts. These have nothing to do with the pipeline. 
They are + * just multiple descriptor set layouts pasted together + */ + +VkResult anv_CreatePipelineLayout( + VkDevice _device, + const VkPipelineLayoutCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkPipelineLayout* pPipelineLayout) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_pipeline_layout *layout; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO); + + layout = vk_object_alloc(&device->vk, pAllocator, sizeof(*layout), + VK_OBJECT_TYPE_PIPELINE_LAYOUT); + if (layout == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + layout->num_sets = pCreateInfo->setLayoutCount; + + unsigned dynamic_offset_count = 0; + + for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) { + ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout, + pCreateInfo->pSetLayouts[set]); + layout->set[set].layout = set_layout; + anv_descriptor_set_layout_ref(set_layout); + + layout->set[set].dynamic_offset_start = dynamic_offset_count; + dynamic_offset_count += set_layout->dynamic_offset_count; + } + assert(dynamic_offset_count < MAX_DYNAMIC_BUFFERS); + + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + for (unsigned s = 0; s < layout->num_sets; s++) { + sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout); + _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start, + sizeof(layout->set[s].dynamic_offset_start)); + } + _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets)); + _mesa_sha1_final(&ctx, layout->sha1); + + *pPipelineLayout = anv_pipeline_layout_to_handle(layout); + + return VK_SUCCESS; +} + +void anv_DestroyPipelineLayout( + VkDevice _device, + VkPipelineLayout _pipelineLayout, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, _pipelineLayout); + + if (!pipeline_layout) + return; + + for (uint32_t i = 0; i < pipeline_layout->num_sets; i++) + anv_descriptor_set_layout_unref(device, pipeline_layout->set[i].layout); + + vk_object_free(&device->vk, pAllocator, pipeline_layout); +} + +/* + * Descriptor pools. + * + * These are implemented using a big pool of memory and a free-list for the + * host memory allocations and a state_stream and a free list for the buffer + * view surface state. The spec allows us to fail to allocate due to + * fragmentation in all cases but two: 1) after pool reset, allocating up + * until the pool size with no freeing must succeed and 2) allocating and + * freeing only descriptor sets with the same layout. Case 1) is easy enough, + * and the free lists lets us recycle blocks for case 2). + */ + +/* The vma heap reserves 0 to mean NULL; we have to offset by some amount to + * ensure we can allocate the entire BO without hitting zero. The actual + * amount doesn't matter. 
+ */ +#define POOL_HEAP_OFFSET 64 + +#define EMPTY 1 + +VkResult anv_CreateDescriptorPool( + VkDevice _device, + const VkDescriptorPoolCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDescriptorPool* pDescriptorPool) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_descriptor_pool *pool; + + const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info = + vk_find_struct_const(pCreateInfo->pNext, + DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO); + const VkMutableDescriptorTypeCreateInfoVALVE *mutable_info = + vk_find_struct_const(pCreateInfo->pNext, + MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_VALVE); + + uint32_t descriptor_count = 0; + uint32_t buffer_view_count = 0; + uint32_t descriptor_bo_size = 0; + + for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; i++) { + enum anv_descriptor_data desc_data = + pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? + anv_descriptor_data_for_mutable_type(device->physical, mutable_info, i) : + anv_descriptor_data_for_type(device->physical, pCreateInfo->pPoolSizes[i].type); + + if (desc_data & ANV_DESCRIPTOR_BUFFER_VIEW) + buffer_view_count += pCreateInfo->pPoolSizes[i].descriptorCount; + + unsigned desc_data_size = + pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? + anv_descriptor_size_for_mutable_type(device->physical, mutable_info, i) : + anv_descriptor_data_size(desc_data); + + desc_data_size *= pCreateInfo->pPoolSizes[i].descriptorCount; + + /* Combined image sampler descriptors can take up to 3 slots if they + * hold a YCbCr image. + */ + if (pCreateInfo->pPoolSizes[i].type == + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + desc_data_size *= 3; + + if (pCreateInfo->pPoolSizes[i].type == + VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + /* Inline uniform blocks are specified to use the descriptor array + * size as the size in bytes of the block. + */ + assert(inline_info); + desc_data_size += pCreateInfo->pPoolSizes[i].descriptorCount; + } + + descriptor_bo_size += desc_data_size; + + descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount; + } + /* We have to align descriptor buffer allocations to 32B so that we can + * push descriptor buffers. This means that each descriptor buffer + * allocated may burn up to 32B of extra space to get the right alignment. + * (Technically, it's at most 28B because we're always going to start at + * least 4B aligned but we're being conservative here.) Allocate enough + * extra space that we can chop it into maxSets pieces and align each one + * of them to 32B. 
+ */ + descriptor_bo_size += ANV_UBO_ALIGNMENT * pCreateInfo->maxSets; + /* We align inline uniform blocks to ANV_UBO_ALIGNMENT */ + if (inline_info) { + descriptor_bo_size += + ANV_UBO_ALIGNMENT * inline_info->maxInlineUniformBlockBindings; + } + descriptor_bo_size = ALIGN(descriptor_bo_size, 4096); + + const size_t pool_size = + pCreateInfo->maxSets * sizeof(struct anv_descriptor_set) + + descriptor_count * sizeof(struct anv_descriptor) + + buffer_view_count * sizeof(struct anv_buffer_view); + const size_t total_size = sizeof(*pool) + pool_size; + + pool = vk_object_alloc(&device->vk, pAllocator, total_size, + VK_OBJECT_TYPE_DESCRIPTOR_POOL); + if (!pool) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + pool->size = pool_size; + pool->next = 0; + pool->free_list = EMPTY; + pool->host_only = pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_VALVE; + + if (descriptor_bo_size > 0) { + VkResult result = anv_device_alloc_bo(device, + "descriptors", + descriptor_bo_size, + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED, + 0 /* explicit_address */, + &pool->bo); + if (result != VK_SUCCESS) { + vk_object_free(&device->vk, pAllocator, pool); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, descriptor_bo_size); + } else { + pool->bo = NULL; + } + + anv_state_stream_init(&pool->surface_state_stream, + &device->surface_state_pool, 4096); + pool->surface_state_free_list = NULL; + + list_inithead(&pool->desc_sets); + + *pDescriptorPool = anv_descriptor_pool_to_handle(pool); + + return VK_SUCCESS; +} + +void anv_DestroyDescriptorPool( + VkDevice _device, + VkDescriptorPool _pool, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_pool, pool, _pool); + + if (!pool) + return; + + list_for_each_entry_safe(struct anv_descriptor_set, set, + &pool->desc_sets, pool_link) { + anv_descriptor_set_layout_unref(device, set->layout); + } + + if (pool->bo) { + util_vma_heap_finish(&pool->bo_heap); + anv_device_release_bo(device, pool->bo); + } + anv_state_stream_finish(&pool->surface_state_stream); + + vk_object_free(&device->vk, pAllocator, pool); +} + +VkResult anv_ResetDescriptorPool( + VkDevice _device, + VkDescriptorPool descriptorPool, + VkDescriptorPoolResetFlags flags) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_pool, pool, descriptorPool); + + list_for_each_entry_safe(struct anv_descriptor_set, set, + &pool->desc_sets, pool_link) { + anv_descriptor_set_layout_unref(device, set->layout); + } + list_inithead(&pool->desc_sets); + + pool->next = 0; + pool->free_list = EMPTY; + + if (pool->bo) { + util_vma_heap_finish(&pool->bo_heap); + util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, pool->bo->size); + } + + anv_state_stream_finish(&pool->surface_state_stream); + anv_state_stream_init(&pool->surface_state_stream, + &device->surface_state_pool, 4096); + pool->surface_state_free_list = NULL; + + return VK_SUCCESS; +} + +struct pool_free_list_entry { + uint32_t next; + uint32_t size; +}; + +static VkResult +anv_descriptor_pool_alloc_set(struct anv_descriptor_pool *pool, + uint32_t size, + struct anv_descriptor_set **set) +{ + if (size <= pool->size - pool->next) { + *set = (struct anv_descriptor_set *) (pool->data + pool->next); + (*set)->size = size; + pool->next += size; + return VK_SUCCESS; + } else { + struct pool_free_list_entry *entry; + uint32_t *link = &pool->free_list; + for (uint32_t f = 
pool->free_list; f != EMPTY; f = entry->next) { + entry = (struct pool_free_list_entry *) (pool->data + f); + if (size <= entry->size) { + *link = entry->next; + *set = (struct anv_descriptor_set *) entry; + (*set)->size = entry->size; + return VK_SUCCESS; + } + link = &entry->next; + } + + if (pool->free_list != EMPTY) { + return VK_ERROR_FRAGMENTED_POOL; + } else { + return VK_ERROR_OUT_OF_POOL_MEMORY; + } + } +} + +static void +anv_descriptor_pool_free_set(struct anv_descriptor_pool *pool, + struct anv_descriptor_set *set) +{ + /* Put the descriptor set allocation back on the free list. */ + const uint32_t index = (char *) set - pool->data; + if (index + set->size == pool->next) { + pool->next = index; + } else { + struct pool_free_list_entry *entry = (struct pool_free_list_entry *) set; + entry->next = pool->free_list; + entry->size = set->size; + pool->free_list = (char *) entry - pool->data; + } +} + +struct surface_state_free_list_entry { + void *next; + struct anv_state state; +}; + +static struct anv_state +anv_descriptor_pool_alloc_state(struct anv_descriptor_pool *pool) +{ + assert(!pool->host_only); + + struct surface_state_free_list_entry *entry = + pool->surface_state_free_list; + + if (entry) { + struct anv_state state = entry->state; + pool->surface_state_free_list = entry->next; + assert(state.alloc_size == 64); + return state; + } else { + return anv_state_stream_alloc(&pool->surface_state_stream, 64, 64); + } +} + +static void +anv_descriptor_pool_free_state(struct anv_descriptor_pool *pool, + struct anv_state state) +{ + assert(state.alloc_size); + /* Put the buffer view surface state back on the free list. */ + struct surface_state_free_list_entry *entry = state.map; + entry->next = pool->surface_state_free_list; + entry->state = state; + pool->surface_state_free_list = entry; +} + +size_t +anv_descriptor_set_layout_size(const struct anv_descriptor_set_layout *layout, + uint32_t var_desc_count) +{ + const uint32_t descriptor_count = + set_layout_descriptor_count(layout, var_desc_count); + const uint32_t buffer_view_count = + set_layout_buffer_view_count(layout, var_desc_count); + + return sizeof(struct anv_descriptor_set) + + descriptor_count * sizeof(struct anv_descriptor) + + buffer_view_count * sizeof(struct anv_buffer_view); +} + +static VkResult +anv_descriptor_set_create(struct anv_device *device, + struct anv_descriptor_pool *pool, + struct anv_descriptor_set_layout *layout, + uint32_t var_desc_count, + struct anv_descriptor_set **out_set) +{ + struct anv_descriptor_set *set; + const size_t size = anv_descriptor_set_layout_size(layout, var_desc_count); + + VkResult result = anv_descriptor_pool_alloc_set(pool, size, &set); + if (result != VK_SUCCESS) + return result; + + uint32_t descriptor_buffer_size = + anv_descriptor_set_layout_descriptor_buffer_size(layout, var_desc_count); + + set->desc_surface_state = ANV_STATE_NULL; + + if (descriptor_buffer_size) { + uint64_t pool_vma_offset = + util_vma_heap_alloc(&pool->bo_heap, descriptor_buffer_size, + ANV_UBO_ALIGNMENT); + if (pool_vma_offset == 0) { + anv_descriptor_pool_free_set(pool, set); + return vk_error(pool, VK_ERROR_FRAGMENTED_POOL); + } + assert(pool_vma_offset >= POOL_HEAP_OFFSET && + pool_vma_offset - POOL_HEAP_OFFSET <= INT32_MAX); + set->desc_mem.offset = pool_vma_offset - POOL_HEAP_OFFSET; + set->desc_mem.alloc_size = descriptor_buffer_size; + set->desc_mem.map = pool->bo->map + set->desc_mem.offset; + + set->desc_addr = (struct anv_address) { + .bo = pool->bo, + .offset = set->desc_mem.offset, + }; + 
+ enum isl_format format = + anv_isl_format_for_descriptor_type(device, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); + + if (!pool->host_only) { + set->desc_surface_state = anv_descriptor_pool_alloc_state(pool); + anv_fill_buffer_surface_state(device, set->desc_surface_state, + format, ISL_SWIZZLE_IDENTITY, + ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, + set->desc_addr, + descriptor_buffer_size, 1); + } + } else { + set->desc_mem = ANV_STATE_NULL; + set->desc_addr = (struct anv_address) { .bo = NULL, .offset = 0 }; + } + + vk_object_base_init(&device->vk, &set->base, + VK_OBJECT_TYPE_DESCRIPTOR_SET); + set->pool = pool; + set->layout = layout; + anv_descriptor_set_layout_ref(layout); + + set->buffer_view_count = + set_layout_buffer_view_count(layout, var_desc_count); + set->descriptor_count = + set_layout_descriptor_count(layout, var_desc_count); + + set->buffer_views = + (struct anv_buffer_view *) &set->descriptors[set->descriptor_count]; + + /* By defining the descriptors to be zero now, we can later verify that + * a descriptor has not been populated with user data. + */ + memset(set->descriptors, 0, + sizeof(struct anv_descriptor) * set->descriptor_count); + + /* Go through and fill out immutable samplers if we have any */ + for (uint32_t b = 0; b < layout->binding_count; b++) { + if (layout->binding[b].immutable_samplers) { + for (uint32_t i = 0; i < layout->binding[b].array_size; i++) { + /* The type will get changed to COMBINED_IMAGE_SAMPLER in + * UpdateDescriptorSets if needed. However, if the descriptor + * set has an immutable sampler, UpdateDescriptorSets may never + * touch it, so we need to make sure it's 100% valid now. + * + * We don't need to actually provide a sampler because the helper + * will always write in the immutable sampler regardless of what + * is in the sampler parameter. + */ + VkDescriptorImageInfo info = { }; + anv_descriptor_set_write_image_view(device, set, &info, + VK_DESCRIPTOR_TYPE_SAMPLER, + b, i); + } + } + } + + /* Allocate null surface state for the buffer views since + * we lazy allocate this in the write anyway. 
+ */ + if (!pool->host_only) { + for (uint32_t b = 0; b < set->buffer_view_count; b++) { + set->buffer_views[b].surface_state = + anv_descriptor_pool_alloc_state(pool); + } + } + + list_addtail(&set->pool_link, &pool->desc_sets); + + *out_set = set; + + return VK_SUCCESS; +} + +static void +anv_descriptor_set_destroy(struct anv_device *device, + struct anv_descriptor_pool *pool, + struct anv_descriptor_set *set) +{ + anv_descriptor_set_layout_unref(device, set->layout); + + if (set->desc_mem.alloc_size) { + util_vma_heap_free(&pool->bo_heap, + (uint64_t)set->desc_mem.offset + POOL_HEAP_OFFSET, + set->desc_mem.alloc_size); + if (set->desc_surface_state.alloc_size) + anv_descriptor_pool_free_state(pool, set->desc_surface_state); + } + + if (!pool->host_only) { + for (uint32_t b = 0; b < set->buffer_view_count; b++) { + if (set->buffer_views[b].surface_state.alloc_size) + anv_descriptor_pool_free_state(pool, set->buffer_views[b].surface_state); + } + } + + list_del(&set->pool_link); + + vk_object_base_finish(&set->base); + anv_descriptor_pool_free_set(pool, set); +} + +VkResult anv_AllocateDescriptorSets( + VkDevice _device, + const VkDescriptorSetAllocateInfo* pAllocateInfo, + VkDescriptorSet* pDescriptorSets) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_pool, pool, pAllocateInfo->descriptorPool); + + VkResult result = VK_SUCCESS; + struct anv_descriptor_set *set = NULL; + uint32_t i; + + const VkDescriptorSetVariableDescriptorCountAllocateInfo *vdcai = + vk_find_struct_const(pAllocateInfo->pNext, + DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_ALLOCATE_INFO); + + for (i = 0; i < pAllocateInfo->descriptorSetCount; i++) { + ANV_FROM_HANDLE(anv_descriptor_set_layout, layout, + pAllocateInfo->pSetLayouts[i]); + + uint32_t var_desc_count = 0; + if (vdcai != NULL && vdcai->descriptorSetCount > 0) { + assert(vdcai->descriptorSetCount == pAllocateInfo->descriptorSetCount); + var_desc_count = vdcai->pDescriptorCounts[i]; + } + + result = anv_descriptor_set_create(device, pool, layout, + var_desc_count, &set); + if (result != VK_SUCCESS) + break; + + pDescriptorSets[i] = anv_descriptor_set_to_handle(set); + } + + if (result != VK_SUCCESS) + anv_FreeDescriptorSets(_device, pAllocateInfo->descriptorPool, + i, pDescriptorSets); + + return result; +} + +VkResult anv_FreeDescriptorSets( + VkDevice _device, + VkDescriptorPool descriptorPool, + uint32_t count, + const VkDescriptorSet* pDescriptorSets) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_pool, pool, descriptorPool); + + for (uint32_t i = 0; i < count; i++) { + ANV_FROM_HANDLE(anv_descriptor_set, set, pDescriptorSets[i]); + + if (!set) + continue; + + anv_descriptor_set_destroy(device, pool, set); + } + + return VK_SUCCESS; +} + +static void +anv_descriptor_set_write_image_param(uint32_t *param_desc_map, + const struct brw_image_param *param) +{ +#define WRITE_PARAM_FIELD(field, FIELD) \ + for (unsigned i = 0; i < ARRAY_SIZE(param->field); i++) \ + param_desc_map[BRW_IMAGE_PARAM_##FIELD##_OFFSET + i] = param->field[i] + + WRITE_PARAM_FIELD(offset, OFFSET); + WRITE_PARAM_FIELD(size, SIZE); + WRITE_PARAM_FIELD(stride, STRIDE); + WRITE_PARAM_FIELD(tiling, TILING); + WRITE_PARAM_FIELD(swizzling, SWIZZLING); + WRITE_PARAM_FIELD(size, SIZE); + +#undef WRITE_PARAM_FIELD +} + +static uint32_t +anv_surface_state_to_handle(struct anv_state state) +{ + /* Bits 31:12 of the bindless surface offset in the extended message + * descriptor is bits 25:6 of the byte-based address. 
+ */ + assert(state.offset >= 0); + uint32_t offset = state.offset; + assert((offset & 0x3f) == 0 && offset < (1 << 26)); + return offset << 6; +} + +void +anv_descriptor_set_write_image_view(struct anv_device *device, + struct anv_descriptor_set *set, + const VkDescriptorImageInfo * const info, + VkDescriptorType type, + uint32_t binding, + uint32_t element) +{ + const struct anv_descriptor_set_binding_layout *bind_layout = + &set->layout->binding[binding]; + struct anv_descriptor *desc = + &set->descriptors[bind_layout->descriptor_index + element]; + struct anv_image_view *image_view = NULL; + struct anv_sampler *sampler = NULL; + + /* We get called with just VK_DESCRIPTOR_TYPE_SAMPLER as part of descriptor + * set initialization to set the bindless samplers. + */ + assert(type == bind_layout->type || + type == VK_DESCRIPTOR_TYPE_SAMPLER || + bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE); + + switch (type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + sampler = bind_layout->immutable_samplers ? + bind_layout->immutable_samplers[element] : + anv_sampler_from_handle(info->sampler); + break; + + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + image_view = anv_image_view_from_handle(info->imageView); + sampler = bind_layout->immutable_samplers ? + bind_layout->immutable_samplers[element] : + anv_sampler_from_handle(info->sampler); + break; + + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + image_view = anv_image_view_from_handle(info->imageView); + break; + + default: + unreachable("invalid descriptor type"); + } + + *desc = (struct anv_descriptor) { + .type = type, + .layout = info->imageLayout, + .image_view = image_view, + .sampler = sampler, + }; + + if (set->pool && set->pool->host_only) + return; + + void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + + element * bind_layout->descriptor_stride; + memset(desc_map, 0, bind_layout->descriptor_stride); + enum anv_descriptor_data data = + bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? + anv_descriptor_data_for_type(device->physical, type) : + bind_layout->data; + + + if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE) { + struct anv_sampled_image_descriptor desc_data[3]; + memset(desc_data, 0, sizeof(desc_data)); + + if (image_view) { + for (unsigned p = 0; p < image_view->n_planes; p++) { + struct anv_surface_state sstate = + (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? + image_view->planes[p].general_sampler_surface_state : + image_view->planes[p].optimal_sampler_surface_state; + desc_data[p].image = anv_surface_state_to_handle(sstate.state); + } + } + + if (sampler) { + for (unsigned p = 0; p < sampler->n_planes; p++) + desc_data[p].sampler = sampler->bindless_state.offset + p * 32; + } + + /* We may have max_plane_count < 0 if this isn't a sampled image but it + * can be no more than the size of our array of handles. 
+ */ + assert(bind_layout->max_plane_count <= ARRAY_SIZE(desc_data)); + memcpy(desc_map, desc_data, + MAX2(1, bind_layout->max_plane_count) * sizeof(desc_data[0])); + } + + if (image_view == NULL) + return; + + if (data & ANV_DESCRIPTOR_STORAGE_IMAGE) { + assert(!(data & ANV_DESCRIPTOR_IMAGE_PARAM)); + assert(image_view->n_planes == 1); + struct anv_storage_image_descriptor desc_data = { + .vanilla = anv_surface_state_to_handle( + image_view->planes[0].storage_surface_state.state), + .lowered = anv_surface_state_to_handle( + image_view->planes[0].lowered_storage_surface_state.state), + }; + memcpy(desc_map, &desc_data, sizeof(desc_data)); + } + + if (data & ANV_DESCRIPTOR_IMAGE_PARAM) { + /* Storage images can only ever have one plane */ + assert(image_view->n_planes == 1); + const struct brw_image_param *image_param = + &image_view->planes[0].lowered_storage_image_param; + + anv_descriptor_set_write_image_param(desc_map, image_param); + } + + if (data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE) { + assert(!(data & ANV_DESCRIPTOR_SAMPLED_IMAGE)); + assert(image_view); + struct anv_texture_swizzle_descriptor desc_data[3]; + memset(desc_data, 0, sizeof(desc_data)); + + for (unsigned p = 0; p < image_view->n_planes; p++) { + desc_data[p] = (struct anv_texture_swizzle_descriptor) { + .swizzle = { + (uint8_t)image_view->planes[p].isl.swizzle.r, + (uint8_t)image_view->planes[p].isl.swizzle.g, + (uint8_t)image_view->planes[p].isl.swizzle.b, + (uint8_t)image_view->planes[p].isl.swizzle.a, + }, + }; + } + memcpy(desc_map, desc_data, + MAX2(1, bind_layout->max_plane_count) * sizeof(desc_data[0])); + } +} + +void +anv_descriptor_set_write_buffer_view(struct anv_device *device, + struct anv_descriptor_set *set, + VkDescriptorType type, + struct anv_buffer_view *buffer_view, + uint32_t binding, + uint32_t element) +{ + const struct anv_descriptor_set_binding_layout *bind_layout = + &set->layout->binding[binding]; + struct anv_descriptor *desc = + &set->descriptors[bind_layout->descriptor_index + element]; + + assert(type == bind_layout->type || + bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE); + + *desc = (struct anv_descriptor) { + .type = type, + .buffer_view = buffer_view, + }; + + if (set->pool && set->pool->host_only) + return; + + enum anv_descriptor_data data = + bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? 
+ anv_descriptor_data_for_type(device->physical, type) : + bind_layout->data; + + void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + + element * bind_layout->descriptor_stride; + + if (buffer_view == NULL) { + memset(desc_map, 0, bind_layout->descriptor_stride); + return; + } + + if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE) { + struct anv_sampled_image_descriptor desc_data = { + .image = anv_surface_state_to_handle(buffer_view->surface_state), + }; + memcpy(desc_map, &desc_data, sizeof(desc_data)); + } + + if (data & ANV_DESCRIPTOR_STORAGE_IMAGE) { + assert(!(data & ANV_DESCRIPTOR_IMAGE_PARAM)); + struct anv_storage_image_descriptor desc_data = { + .vanilla = anv_surface_state_to_handle( + buffer_view->storage_surface_state), + .lowered = anv_surface_state_to_handle( + buffer_view->lowered_storage_surface_state), + }; + memcpy(desc_map, &desc_data, sizeof(desc_data)); + } + + if (data & ANV_DESCRIPTOR_IMAGE_PARAM) { + anv_descriptor_set_write_image_param(desc_map, + &buffer_view->lowered_storage_image_param); + } +} + +void +anv_descriptor_set_write_buffer(struct anv_device *device, + struct anv_descriptor_set *set, + struct anv_state_stream *alloc_stream, + VkDescriptorType type, + struct anv_buffer *buffer, + uint32_t binding, + uint32_t element, + VkDeviceSize offset, + VkDeviceSize range) +{ + assert(alloc_stream || set->pool); + + const struct anv_descriptor_set_binding_layout *bind_layout = + &set->layout->binding[binding]; + struct anv_descriptor *desc = + &set->descriptors[bind_layout->descriptor_index + element]; + + assert(type == bind_layout->type || + bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE); + + *desc = (struct anv_descriptor) { + .type = type, + .offset = offset, + .range = range, + .buffer = buffer, + }; + + if (set->pool && set->pool->host_only) + return; + + void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + + element * bind_layout->descriptor_stride; + + if (buffer == NULL) { + memset(desc_map, 0, bind_layout->descriptor_stride); + return; + } + + struct anv_address bind_addr = anv_address_add(buffer->address, offset); + uint64_t bind_range = vk_buffer_range(&buffer->vk, offset, range); + enum anv_descriptor_data data = + bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_VALVE ? + anv_descriptor_data_for_type(device->physical, type) : + bind_layout->data; + + /* We report a bounds checking alignment of 32B for the sake of block + * messages which read an entire register worth at a time. + */ + if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || + type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) + bind_range = align_u64(bind_range, ANV_UBO_ALIGNMENT); + + if (data & ANV_DESCRIPTOR_ADDRESS_RANGE) { + struct anv_address_range_descriptor desc_data = { + .address = anv_address_physical(bind_addr), + .range = bind_range, + }; + memcpy(desc_map, &desc_data, sizeof(desc_data)); + } + + if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || + type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) + return; + + assert(data & ANV_DESCRIPTOR_BUFFER_VIEW); + struct anv_buffer_view *bview = + &set->buffer_views[bind_layout->buffer_view_index + element]; + + bview->range = bind_range; + bview->address = bind_addr; + + /* If we're writing descriptors through a push command, we need to + * allocate the surface state from the command buffer. Otherwise it will + * be allocated by the descriptor pool when calling + * vkAllocateDescriptorSets. 
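+    * (When the set comes from the pool instead, these surface states were
+    * already allocated in anv_descriptor_set_create above, in the
+    * !pool->host_only buffer_views loop.)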
*/ + if (alloc_stream) { + bview->surface_state = anv_state_stream_alloc(alloc_stream, 64, 64); + } + + assert(bview->surface_state.alloc_size); + + isl_surf_usage_flags_t usage = + (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER || + type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) ? + ISL_SURF_USAGE_CONSTANT_BUFFER_BIT : + ISL_SURF_USAGE_STORAGE_BIT; + + enum isl_format format = anv_isl_format_for_descriptor_type(device, type); + anv_fill_buffer_surface_state(device, bview->surface_state, + format, ISL_SWIZZLE_IDENTITY, + usage, bind_addr, bind_range, 1); + desc->set_buffer_view = bview; +} + +void +anv_descriptor_set_write_inline_uniform_data(struct anv_device *device, + struct anv_descriptor_set *set, + uint32_t binding, + const void *data, + size_t offset, + size_t size) +{ + const struct anv_descriptor_set_binding_layout *bind_layout = + &set->layout->binding[binding]; + + assert(bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM); + + void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset; + + memcpy(desc_map + offset, data, size); +} + +void +anv_descriptor_set_write_acceleration_structure(struct anv_device *device, + struct anv_descriptor_set *set, + struct anv_acceleration_structure *accel, + uint32_t binding, + uint32_t element) +{ + const struct anv_descriptor_set_binding_layout *bind_layout = + &set->layout->binding[binding]; + struct anv_descriptor *desc = + &set->descriptors[bind_layout->descriptor_index + element]; + + assert(bind_layout->data & ANV_DESCRIPTOR_ADDRESS_RANGE); + *desc = (struct anv_descriptor) { + .type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR, + .accel_struct = accel, + }; + + if (set->pool && set->pool->host_only) + return; + + struct anv_address_range_descriptor desc_data = { }; + if (accel != NULL) { + desc_data.address = anv_address_physical(accel->address); + desc_data.range = accel->size; + } + assert(sizeof(desc_data) <= bind_layout->descriptor_stride); + + void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + + element * bind_layout->descriptor_stride; + memcpy(desc_map, &desc_data, sizeof(desc_data)); +} + +void anv_UpdateDescriptorSets( + VkDevice _device, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites, + uint32_t descriptorCopyCount, + const VkCopyDescriptorSet* pDescriptorCopies) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + for (uint32_t i = 0; i < descriptorWriteCount; i++) { + const VkWriteDescriptorSet *write = &pDescriptorWrites[i]; + ANV_FROM_HANDLE(anv_descriptor_set, set, write->dstSet); + + switch (write->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + anv_descriptor_set_write_image_view(device, set, + write->pImageInfo + j, + write->descriptorType, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + ANV_FROM_HANDLE(anv_buffer_view, bview, + write->pTexelBufferView[j]); + + anv_descriptor_set_write_buffer_view(device, set, + write->descriptorType, + bview, + write->dstBinding, + write->dstArrayElement + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + 
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < write->descriptorCount; j++) { + ANV_FROM_HANDLE(anv_buffer, buffer, write->pBufferInfo[j].buffer); + + anv_descriptor_set_write_buffer(device, set, + NULL, + write->descriptorType, + buffer, + write->dstBinding, + write->dstArrayElement + j, + write->pBufferInfo[j].offset, + write->pBufferInfo[j].range); + } + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { + const VkWriteDescriptorSetInlineUniformBlock *inline_write = + vk_find_struct_const(write->pNext, + WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK); + assert(inline_write->dataSize == write->descriptorCount); + anv_descriptor_set_write_inline_uniform_data(device, set, + write->dstBinding, + inline_write->pData, + write->dstArrayElement, + inline_write->dataSize); + break; + } + + case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: { + const VkWriteDescriptorSetAccelerationStructureKHR *accel_write = + vk_find_struct_const(write, WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR); + assert(accel_write->accelerationStructureCount == + write->descriptorCount); + for (uint32_t j = 0; j < write->descriptorCount; j++) { + ANV_FROM_HANDLE(anv_acceleration_structure, accel, + accel_write->pAccelerationStructures[j]); + anv_descriptor_set_write_acceleration_structure(device, set, accel, + write->dstBinding, + write->dstArrayElement + j); + } + break; + } + + default: + break; + } + } + + for (uint32_t i = 0; i < descriptorCopyCount; i++) { + const VkCopyDescriptorSet *copy = &pDescriptorCopies[i]; + ANV_FROM_HANDLE(anv_descriptor_set, src, copy->srcSet); + ANV_FROM_HANDLE(anv_descriptor_set, dst, copy->dstSet); + + const struct anv_descriptor_set_binding_layout *src_layout = + &src->layout->binding[copy->srcBinding]; + struct anv_descriptor *src_desc = + &src->descriptors[src_layout->descriptor_index]; + src_desc += copy->srcArrayElement; + + if (src_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + anv_descriptor_set_write_inline_uniform_data(device, dst, + copy->dstBinding, + src->desc_mem.map + src_layout->descriptor_offset + copy->srcArrayElement, + copy->dstArrayElement, + copy->descriptorCount); + continue; + } + + + /* Copy CPU side data */ + for (uint32_t j = 0; j < copy->descriptorCount; j++) { + switch(src_desc[j].type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: { + VkDescriptorImageInfo info = { + .sampler = anv_sampler_to_handle(src_desc[j].sampler), + .imageView = anv_image_view_to_handle(src_desc[j].image_view), + .imageLayout = src_desc[j].layout + }; + anv_descriptor_set_write_image_view(device, dst, + &info, + src_desc[j].type, + copy->dstBinding, + copy->dstArrayElement + j); + break; + } + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: { + anv_descriptor_set_write_buffer_view(device, dst, + src_desc[j].type, + src_desc[j].buffer_view, + copy->dstBinding, + copy->dstArrayElement + j); + break; + } + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + anv_descriptor_set_write_buffer(device, dst, + NULL, + src_desc[j].type, + src_desc[j].buffer, + copy->dstBinding, + copy->dstArrayElement + j, + src_desc[j].offset, + src_desc[j].range); + break; + } + + case 
VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: { + anv_descriptor_set_write_acceleration_structure(device, dst, + src_desc[j].accel_struct, + copy->dstBinding, + copy->dstArrayElement + j); + break; + } + + default: + break; + } + } + } +} + +/* + * Descriptor update templates. + */ + +void +anv_descriptor_set_write_template(struct anv_device *device, + struct anv_descriptor_set *set, + struct anv_state_stream *alloc_stream, + const struct anv_descriptor_update_template *template, + const void *data) +{ + for (uint32_t i = 0; i < template->entry_count; i++) { + const struct anv_descriptor_template_entry *entry = + &template->entries[i]; + + switch (entry->type) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorImageInfo *info = + data + entry->offset + j * entry->stride; + anv_descriptor_set_write_image_view(device, set, + info, entry->type, + entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkBufferView *_bview = + data + entry->offset + j * entry->stride; + ANV_FROM_HANDLE(anv_buffer_view, bview, *_bview); + + anv_descriptor_set_write_buffer_view(device, set, + entry->type, + bview, + entry->binding, + entry->array_element + j); + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + for (uint32_t j = 0; j < entry->array_count; j++) { + const VkDescriptorBufferInfo *info = + data + entry->offset + j * entry->stride; + ANV_FROM_HANDLE(anv_buffer, buffer, info->buffer); + + anv_descriptor_set_write_buffer(device, set, + alloc_stream, + entry->type, + buffer, + entry->binding, + entry->array_element + j, + info->offset, info->range); + } + break; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + anv_descriptor_set_write_inline_uniform_data(device, set, + entry->binding, + data + entry->offset, + entry->array_element, + entry->array_count); + break; + + case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: + for (uint32_t j = 0; j < entry->array_count; j++) { + VkAccelerationStructureKHR *accel_obj = + (VkAccelerationStructureKHR *)(data + entry->offset + j * entry->stride); + ANV_FROM_HANDLE(anv_acceleration_structure, accel, *accel_obj); + + anv_descriptor_set_write_acceleration_structure(device, set, + accel, + entry->binding, + entry->array_element + j); + } + break; + + default: + break; + } + } +} + +VkResult anv_CreateDescriptorUpdateTemplate( + VkDevice _device, + const VkDescriptorUpdateTemplateCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDescriptorUpdateTemplate* pDescriptorUpdateTemplate) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_descriptor_update_template *template; + + size_t size = sizeof(*template) + + pCreateInfo->descriptorUpdateEntryCount * sizeof(template->entries[0]); + template = vk_object_alloc(&device->vk, pAllocator, size, + VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE); + if (template == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + template->bind_point = pCreateInfo->pipelineBindPoint; + + if (pCreateInfo->templateType == 
VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET) + template->set = pCreateInfo->set; + + template->entry_count = pCreateInfo->descriptorUpdateEntryCount; + for (uint32_t i = 0; i < template->entry_count; i++) { + const VkDescriptorUpdateTemplateEntry *pEntry = + &pCreateInfo->pDescriptorUpdateEntries[i]; + + template->entries[i] = (struct anv_descriptor_template_entry) { + .type = pEntry->descriptorType, + .binding = pEntry->dstBinding, + .array_element = pEntry->dstArrayElement, + .array_count = pEntry->descriptorCount, + .offset = pEntry->offset, + .stride = pEntry->stride, + }; + } + + *pDescriptorUpdateTemplate = + anv_descriptor_update_template_to_handle(template); + + return VK_SUCCESS; +} + +void anv_DestroyDescriptorUpdateTemplate( + VkDevice _device, + VkDescriptorUpdateTemplate descriptorUpdateTemplate, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_update_template, template, + descriptorUpdateTemplate); + + if (!template) + return; + + vk_object_free(&device->vk, pAllocator, template); +} + +void anv_UpdateDescriptorSetWithTemplate( + VkDevice _device, + VkDescriptorSet descriptorSet, + VkDescriptorUpdateTemplate descriptorUpdateTemplate, + const void* pData) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_descriptor_set, set, descriptorSet); + ANV_FROM_HANDLE(anv_descriptor_update_template, template, + descriptorUpdateTemplate); + + anv_descriptor_set_write_template(device, set, NULL, template, pData); +} diff --git a/src/intel/vulkan_hasvk/anv_device.c b/src/intel/vulkan_hasvk/anv_device.c new file mode 100644 index 00000000000..5c833e9f8d3 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_device.c @@ -0,0 +1,4834 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#ifdef MAJOR_IN_MKDEV +#include +#endif +#ifdef MAJOR_IN_SYSMACROS +#include +#endif +#include +#include +#include +#include +#include "drm-uapi/drm_fourcc.h" +#include "drm-uapi/drm.h" +#include + +#include "anv_private.h" +#include "anv_measure.h" +#include "util/debug.h" +#include "util/build_id.h" +#include "util/disk_cache.h" +#include "util/mesa-sha1.h" +#include "util/os_file.h" +#include "util/os_misc.h" +#include "util/u_atomic.h" +#include "util/u_string.h" +#include "util/driconf.h" +#include "git_sha1.h" +#include "vk_util.h" +#include "vk_deferred_operation.h" +#include "vk_drm_syncobj.h" +#include "common/intel_aux_map.h" +#include "common/intel_defines.h" +#include "common/intel_uuid.h" +#include "perf/intel_perf.h" + +#include "genxml/gen7_pack.h" +#include "genxml/genX_bits.h" + +static const driOptionDescription anv_dri_options[] = { + DRI_CONF_SECTION_PERFORMANCE + DRI_CONF_ADAPTIVE_SYNC(true) + DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0) + DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false) + DRI_CONF_VK_XWAYLAND_WAIT_READY(true) + DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(false) + DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(false) + DRI_CONF_SECTION_END + + DRI_CONF_SECTION_DEBUG + DRI_CONF_ALWAYS_FLUSH_CACHE(false) + DRI_CONF_VK_WSI_FORCE_BGRA8_UNORM_FIRST(false) + DRI_CONF_LIMIT_TRIG_INPUT_RANGE(false) + DRI_CONF_SECTION_END +}; + +/* This is probably far to big but it reflects the max size used for messages + * in OpenGLs KHR_debug. + */ +#define MAX_DEBUG_MESSAGE_LENGTH 4096 + +/* Render engine timestamp register */ +#define TIMESTAMP 0x2358 + +/* The "RAW" clocks on Linux are called "FAST" on FreeBSD */ +#if !defined(CLOCK_MONOTONIC_RAW) && defined(CLOCK_MONOTONIC_FAST) +#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC_FAST +#endif + +static void +compiler_debug_log(void *data, UNUSED unsigned *id, const char *fmt, ...) +{ + char str[MAX_DEBUG_MESSAGE_LENGTH]; + struct anv_device *device = (struct anv_device *)data; + UNUSED struct anv_instance *instance = device->physical->instance; + + va_list args; + va_start(args, fmt); + (void) vsnprintf(str, MAX_DEBUG_MESSAGE_LENGTH, fmt, args); + va_end(args); + + //vk_logd(VK_LOG_NO_OBJS(&instance->vk), "%s", str); +} + +static void +compiler_perf_log(UNUSED void *data, UNUSED unsigned *id, const char *fmt, ...) 
+{ + va_list args; + va_start(args, fmt); + + if (INTEL_DEBUG(DEBUG_PERF)) + mesa_logd_v(fmt, args); + + va_end(args); +} + +#if defined(VK_USE_PLATFORM_WAYLAND_KHR) || \ + defined(VK_USE_PLATFORM_XCB_KHR) || \ + defined(VK_USE_PLATFORM_XLIB_KHR) || \ + defined(VK_USE_PLATFORM_DISPLAY_KHR) +#define ANV_USE_WSI_PLATFORM +#endif + +#ifdef ANDROID +#define ANV_API_VERSION VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION) +#else +#define ANV_API_VERSION VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION) +#endif + +VkResult anv_EnumerateInstanceVersion( + uint32_t* pApiVersion) +{ + *pApiVersion = ANV_API_VERSION; + return VK_SUCCESS; +} + +static const struct vk_instance_extension_table instance_extensions = { + .KHR_device_group_creation = true, + .KHR_external_fence_capabilities = true, + .KHR_external_memory_capabilities = true, + .KHR_external_semaphore_capabilities = true, + .KHR_get_physical_device_properties2 = true, + .EXT_debug_report = true, + .EXT_debug_utils = true, + +#ifdef ANV_USE_WSI_PLATFORM + .KHR_get_surface_capabilities2 = true, + .KHR_surface = true, + .KHR_surface_protected_capabilities = true, +#endif +#ifdef VK_USE_PLATFORM_WAYLAND_KHR + .KHR_wayland_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XCB_KHR + .KHR_xcb_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XLIB_KHR + .KHR_xlib_surface = true, +#endif +#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT + .EXT_acquire_xlib_display = true, +#endif +#ifdef VK_USE_PLATFORM_DISPLAY_KHR + .KHR_display = true, + .KHR_get_display_properties2 = true, + .EXT_direct_mode_display = true, + .EXT_display_surface_counter = true, + .EXT_acquire_drm_display = true, +#endif +}; + +static void +get_device_extensions(const struct anv_physical_device *device, + struct vk_device_extension_table *ext) +{ + const bool has_syncobj_wait = + (device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT) != 0; + + const bool nv_mesh_shading_enabled = + env_var_as_boolean("ANV_EXPERIMENTAL_NV_MESH_SHADER", false); + + *ext = (struct vk_device_extension_table) { + .KHR_8bit_storage = device->info.ver >= 8, + .KHR_16bit_storage = device->info.ver >= 8, + .KHR_bind_memory2 = true, + .KHR_buffer_device_address = device->has_a64_buffer_access, + .KHR_copy_commands2 = true, + .KHR_create_renderpass2 = true, + .KHR_dedicated_allocation = true, + .KHR_deferred_host_operations = true, + .KHR_depth_stencil_resolve = true, + .KHR_descriptor_update_template = true, + .KHR_device_group = true, + .KHR_draw_indirect_count = true, + .KHR_driver_properties = true, + .KHR_dynamic_rendering = true, + .KHR_external_fence = has_syncobj_wait, + .KHR_external_fence_fd = has_syncobj_wait, + .KHR_external_memory = true, + .KHR_external_memory_fd = true, + .KHR_external_semaphore = true, + .KHR_external_semaphore_fd = true, + .KHR_format_feature_flags2 = true, + .KHR_fragment_shading_rate = device->info.ver >= 11, + .KHR_get_memory_requirements2 = true, + .KHR_image_format_list = true, + .KHR_imageless_framebuffer = true, +#ifdef ANV_USE_WSI_PLATFORM + .KHR_incremental_present = true, +#endif + .KHR_maintenance1 = true, + .KHR_maintenance2 = true, + .KHR_maintenance3 = true, + .KHR_maintenance4 = true, + .KHR_multiview = true, + .KHR_performance_query = + !anv_use_relocations(device) && device->perf && + (device->perf->i915_perf_version >= 3 || + INTEL_DEBUG(DEBUG_NO_OACONFIG)) && + device->use_call_secondary, + .KHR_pipeline_executable_properties = true, + .KHR_push_descriptor = true, + .KHR_ray_query = device->info.has_ray_tracing, + .KHR_relaxed_block_layout = true, + 
.KHR_sampler_mirror_clamp_to_edge = true, + .KHR_sampler_ycbcr_conversion = true, + .KHR_separate_depth_stencil_layouts = true, + .KHR_shader_atomic_int64 = device->info.ver >= 9, + .KHR_shader_clock = true, + .KHR_shader_draw_parameters = true, + .KHR_shader_float16_int8 = device->info.ver >= 8, + .KHR_shader_float_controls = device->info.ver >= 8, + .KHR_shader_integer_dot_product = true, + .KHR_shader_non_semantic_info = true, + .KHR_shader_subgroup_extended_types = device->info.ver >= 8, + .KHR_shader_subgroup_uniform_control_flow = true, + .KHR_shader_terminate_invocation = true, + .KHR_spirv_1_4 = true, + .KHR_storage_buffer_storage_class = true, +#ifdef ANV_USE_WSI_PLATFORM + .KHR_swapchain = true, + .KHR_swapchain_mutable_format = true, +#endif + .KHR_synchronization2 = true, + .KHR_timeline_semaphore = true, + .KHR_uniform_buffer_standard_layout = true, + .KHR_variable_pointers = true, + .KHR_vulkan_memory_model = true, + .KHR_workgroup_memory_explicit_layout = true, + .KHR_zero_initialize_workgroup_memory = true, + .EXT_4444_formats = true, + .EXT_border_color_swizzle = device->info.ver >= 8, + .EXT_buffer_device_address = device->has_a64_buffer_access, + .EXT_calibrated_timestamps = device->has_reg_timestamp, + .EXT_color_write_enable = true, + .EXT_conditional_rendering = device->info.verx10 >= 75, + .EXT_conservative_rasterization = device->info.ver >= 9, + .EXT_custom_border_color = device->info.ver >= 8, + .EXT_depth_clip_control = true, + .EXT_depth_clip_enable = true, + .EXT_descriptor_indexing = device->has_a64_buffer_access && + device->has_bindless_images, +#ifdef VK_USE_PLATFORM_DISPLAY_KHR + .EXT_display_control = true, +#endif + .EXT_extended_dynamic_state = true, + .EXT_extended_dynamic_state2 = true, + .EXT_external_memory_dma_buf = true, + .EXT_external_memory_host = true, + .EXT_fragment_shader_interlock = device->info.ver >= 9, + .EXT_global_priority = device->max_context_priority >= + INTEL_CONTEXT_MEDIUM_PRIORITY, + .EXT_global_priority_query = device->max_context_priority >= + INTEL_CONTEXT_MEDIUM_PRIORITY, + .EXT_host_query_reset = true, + .EXT_image_2d_view_of_3d = true, + .EXT_image_robustness = true, + .EXT_image_drm_format_modifier = true, + .EXT_image_view_min_lod = true, + .EXT_index_type_uint8 = true, + .EXT_inline_uniform_block = true, + .EXT_line_rasterization = true, + /* Enable the extension only if we have support on both the local & + * system memory + */ + .EXT_memory_budget = (!device->info.has_local_mem || + device->vram_mappable.available > 0) && + device->sys.available, + .EXT_non_seamless_cube_map = true, + .EXT_pci_bus_info = true, + .EXT_physical_device_drm = true, + .EXT_pipeline_creation_cache_control = true, + .EXT_pipeline_creation_feedback = true, + .EXT_post_depth_coverage = device->info.ver >= 9, + .EXT_primitives_generated_query = true, + .EXT_primitive_topology_list_restart = true, + .EXT_private_data = true, + .EXT_provoking_vertex = true, + .EXT_queue_family_foreign = true, + .EXT_robustness2 = true, + .EXT_sample_locations = true, + .EXT_sampler_filter_minmax = device->info.ver >= 9, + .EXT_scalar_block_layout = true, + .EXT_separate_stencil_usage = true, + .EXT_shader_atomic_float = true, + .EXT_shader_atomic_float2 = device->info.ver >= 9, + .EXT_shader_demote_to_helper_invocation = true, + .EXT_shader_module_identifier = true, + .EXT_shader_stencil_export = device->info.ver >= 9, + .EXT_shader_subgroup_ballot = true, + .EXT_shader_subgroup_vote = true, + .EXT_shader_viewport_index_layer = true, + .EXT_subgroup_size_control 
= true, + .EXT_texel_buffer_alignment = true, + .EXT_tooling_info = true, + .EXT_transform_feedback = true, + .EXT_vertex_attribute_divisor = true, + .EXT_ycbcr_image_arrays = true, +#ifdef ANDROID + .ANDROID_external_memory_android_hardware_buffer = true, + .ANDROID_native_buffer = true, +#endif + .GOOGLE_decorate_string = true, + .GOOGLE_hlsl_functionality1 = true, + .GOOGLE_user_type = true, + .INTEL_performance_query = device->perf && + device->perf->i915_perf_version >= 3, + .INTEL_shader_integer_functions2 = device->info.ver >= 8, + .EXT_multi_draw = true, + .NV_compute_shader_derivatives = true, + .NV_mesh_shader = device->info.has_mesh_shading && + nv_mesh_shading_enabled, + .VALVE_mutable_descriptor_type = true, + }; +} + +static uint64_t +anv_compute_sys_heap_size(struct anv_physical_device *device, + uint64_t total_ram) +{ + /* We don't want to burn too much ram with the GPU. If the user has 4GiB + * or less, we use at most half. If they have more than 4GiB, we use 3/4. + */ + uint64_t available_ram; + if (total_ram <= 4ull * 1024ull * 1024ull * 1024ull) + available_ram = total_ram / 2; + else + available_ram = total_ram * 3 / 4; + + /* We also want to leave some padding for things we allocate in the driver, + * so don't go over 3/4 of the GTT either. + */ + available_ram = MIN2(available_ram, device->gtt_size * 3 / 4); + + if (available_ram > (2ull << 30) && !device->supports_48bit_addresses) { + /* When running with an overridden PCI ID, we may get a GTT size from + * the kernel that is greater than 2 GiB but the execbuf check for 48bit + * address support can still fail. Just clamp the address space size to + * 2 GiB if we don't have 48-bit support. + */ + mesa_logw("%s:%d: The kernel reported a GTT size larger than 2 GiB but " + "not support for 48-bit addresses", + __FILE__, __LINE__); + available_ram = 2ull << 30; + } + + return available_ram; +} + +static VkResult MUST_CHECK +anv_init_meminfo(struct anv_physical_device *device, int fd) +{ + const struct intel_device_info *devinfo = &device->info; + + device->sys.region.memory_class = devinfo->mem.sram.mem_class; + device->sys.region.memory_instance = devinfo->mem.sram.mem_instance; + device->sys.size = + anv_compute_sys_heap_size(device, devinfo->mem.sram.mappable.size); + device->sys.available = devinfo->mem.sram.mappable.free; + + device->vram_mappable.region.memory_class = devinfo->mem.vram.mem_class; + device->vram_mappable.region.memory_instance = + devinfo->mem.vram.mem_instance; + device->vram_mappable.size = devinfo->mem.vram.mappable.size; + device->vram_mappable.available = devinfo->mem.vram.mappable.free; + + device->vram_non_mappable.region.memory_class = + devinfo->mem.vram.mem_class; + device->vram_non_mappable.region.memory_instance = + devinfo->mem.vram.mem_instance; + device->vram_non_mappable.size = devinfo->mem.vram.unmappable.size; + device->vram_non_mappable.available = devinfo->mem.vram.unmappable.free; + + return VK_SUCCESS; +} + +static void +anv_update_meminfo(struct anv_physical_device *device, int fd) +{ + if (!intel_device_info_update_memory_info(&device->info, fd)) + return; + + const struct intel_device_info *devinfo = &device->info; + device->sys.available = devinfo->mem.sram.mappable.free; + device->vram_mappable.available = devinfo->mem.vram.mappable.free; + device->vram_non_mappable.available = devinfo->mem.vram.unmappable.free; +} + + +static VkResult +anv_physical_device_init_heaps(struct anv_physical_device *device, int fd) +{ + VkResult result = anv_init_meminfo(device, fd); + if 
(result != VK_SUCCESS) + return result; + + assert(device->sys.size != 0); + + if (anv_physical_device_has_vram(device)) { + /* We can create 2 or 3 different heaps when we have local memory + * support, first heap with local memory size and second with system + * memory size and the third is added only if part of the vram is + * mappable to the host. + */ + device->memory.heap_count = 2; + device->memory.heaps[0] = (struct anv_memory_heap) { + /* If there is a vram_non_mappable, use that for the device only + * heap. Otherwise use the vram_mappable. + */ + .size = device->vram_non_mappable.size != 0 ? + device->vram_non_mappable.size : device->vram_mappable.size, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + .is_local_mem = true, + }; + device->memory.heaps[1] = (struct anv_memory_heap) { + .size = device->sys.size, + .flags = 0, + .is_local_mem = false, + }; + /* Add an additional smaller vram mappable heap if we can't map all the + * vram to the host. + */ + if (device->vram_non_mappable.size > 0) { + device->memory.heap_count++; + device->memory.heaps[2] = (struct anv_memory_heap) { + .size = device->vram_mappable.size, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + .is_local_mem = true, + }; + } + + device->memory.type_count = 3; + device->memory.types[0] = (struct anv_memory_type) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + .heapIndex = 0, + }; + device->memory.types[1] = (struct anv_memory_type) { + .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT, + .heapIndex = 1, + }; + device->memory.types[2] = (struct anv_memory_type) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + /* This memory type either comes from heaps[0] if there is only + * mappable vram region, or from heaps[2] if there is both mappable & + * non-mappable vram regions. + */ + .heapIndex = device->vram_non_mappable.size > 0 ? 2 : 0, + }; + } else if (device->info.has_llc) { + device->memory.heap_count = 1; + device->memory.heaps[0] = (struct anv_memory_heap) { + .size = device->sys.size, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + .is_local_mem = false, + }; + + /* Big core GPUs share LLC with the CPU and thus one memory type can be + * both cached and coherent at the same time. + */ + device->memory.type_count = 1; + device->memory.types[0] = (struct anv_memory_type) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT, + .heapIndex = 0, + }; + } else { + device->memory.heap_count = 1; + device->memory.heaps[0] = (struct anv_memory_heap) { + .size = device->sys.size, + .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, + .is_local_mem = false, + }; + + /* The spec requires that we expose a host-visible, coherent memory + * type, but Atom GPUs don't share LLC. Thus we offer two memory types + * to give the application a choice between cached, but not coherent and + * coherent but uncached (WC though). 
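+    * Concretely, type 0 below is HOST_CACHED but not HOST_COHERENT (so it
+    * relies on the clflush handling set up further down), while type 1 is
+    * HOST_COHERENT but not HOST_CACHED.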
+ */ + device->memory.type_count = 2; + device->memory.types[0] = (struct anv_memory_type) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT, + .heapIndex = 0, + }; + device->memory.types[1] = (struct anv_memory_type) { + .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + .heapIndex = 0, + }; + } + + device->memory.need_clflush = false; + for (unsigned i = 0; i < device->memory.type_count; i++) { + VkMemoryPropertyFlags props = device->memory.types[i].propertyFlags; + if ((props & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) && + !(props & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) + device->memory.need_clflush = true; + } + + return VK_SUCCESS; +} + +static VkResult +anv_physical_device_init_uuids(struct anv_physical_device *device) +{ + const struct build_id_note *note = + build_id_find_nhdr_for_addr(anv_physical_device_init_uuids); + if (!note) { + return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "Failed to find build-id"); + } + + unsigned build_id_len = build_id_length(note); + if (build_id_len < 20) { + return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "build-id too short. It needs to be a SHA"); + } + + memcpy(device->driver_build_sha1, build_id_data(note), 20); + + struct mesa_sha1 sha1_ctx; + uint8_t sha1[20]; + STATIC_ASSERT(VK_UUID_SIZE <= sizeof(sha1)); + + /* The pipeline cache UUID is used for determining when a pipeline cache is + * invalid. It needs both a driver build and the PCI ID of the device. + */ + _mesa_sha1_init(&sha1_ctx); + _mesa_sha1_update(&sha1_ctx, build_id_data(note), build_id_len); + _mesa_sha1_update(&sha1_ctx, &device->info.pci_device_id, + sizeof(device->info.pci_device_id)); + _mesa_sha1_update(&sha1_ctx, &device->always_use_bindless, + sizeof(device->always_use_bindless)); + _mesa_sha1_update(&sha1_ctx, &device->has_a64_buffer_access, + sizeof(device->has_a64_buffer_access)); + _mesa_sha1_update(&sha1_ctx, &device->has_bindless_images, + sizeof(device->has_bindless_images)); + _mesa_sha1_update(&sha1_ctx, &device->has_bindless_samplers, + sizeof(device->has_bindless_samplers)); + _mesa_sha1_final(&sha1_ctx, sha1); + memcpy(device->pipeline_cache_uuid, sha1, VK_UUID_SIZE); + + intel_uuid_compute_driver_id(device->driver_uuid, &device->info, VK_UUID_SIZE); + intel_uuid_compute_device_id(device->device_uuid, &device->info, VK_UUID_SIZE); + + return VK_SUCCESS; +} + +static void +anv_physical_device_init_disk_cache(struct anv_physical_device *device) +{ +#ifdef ENABLE_SHADER_CACHE + char renderer[10]; + ASSERTED int len = snprintf(renderer, sizeof(renderer), "anv_%04x", + device->info.pci_device_id); + assert(len == sizeof(renderer) - 2); + + char timestamp[41]; + _mesa_sha1_format(timestamp, device->driver_build_sha1); + + const uint64_t driver_flags = + brw_get_compiler_config_value(device->compiler); + device->vk.disk_cache = disk_cache_create(renderer, timestamp, driver_flags); +#endif +} + +static void +anv_physical_device_free_disk_cache(struct anv_physical_device *device) +{ +#ifdef ENABLE_SHADER_CACHE + if (device->vk.disk_cache) { + disk_cache_destroy(device->vk.disk_cache); + device->vk.disk_cache = NULL; + } +#else + assert(device->vk.disk_cache == NULL); +#endif +} + +/* The ANV_QUEUE_OVERRIDE environment variable is a comma separated list of + * queue overrides. 
+ * + * To override the number queues: + * * "gc" is for graphics queues with compute support + * * "g" is for graphics queues with no compute support + * * "c" is for compute queues with no graphics support + * + * For example, ANV_QUEUE_OVERRIDE=gc=2,c=1 would override the number of + * advertised queues to be 2 queues with graphics+compute support, and 1 queue + * with compute-only support. + * + * ANV_QUEUE_OVERRIDE=c=1 would override the number of advertised queues to + * include 1 queue with compute-only support, but it will not change the + * number of graphics+compute queues. + * + * ANV_QUEUE_OVERRIDE=gc=0,c=1 would override the number of advertised queues + * to include 1 queue with compute-only support, and it would override the + * number of graphics+compute queues to be 0. + */ +static void +anv_override_engine_counts(int *gc_count, int *g_count, int *c_count) +{ + int gc_override = -1; + int g_override = -1; + int c_override = -1; + char *env = getenv("ANV_QUEUE_OVERRIDE"); + + if (env == NULL) + return; + + env = strdup(env); + char *save = NULL; + char *next = strtok_r(env, ",", &save); + while (next != NULL) { + if (strncmp(next, "gc=", 3) == 0) { + gc_override = strtol(next + 3, NULL, 0); + } else if (strncmp(next, "g=", 2) == 0) { + g_override = strtol(next + 2, NULL, 0); + } else if (strncmp(next, "c=", 2) == 0) { + c_override = strtol(next + 2, NULL, 0); + } else { + mesa_logw("Ignoring unsupported ANV_QUEUE_OVERRIDE token: %s", next); + } + next = strtok_r(NULL, ",", &save); + } + free(env); + if (gc_override >= 0) + *gc_count = gc_override; + if (g_override >= 0) + *g_count = g_override; + if (*g_count > 0 && *gc_count <= 0 && (gc_override >= 0 || g_override >= 0)) + mesa_logw("ANV_QUEUE_OVERRIDE: gc=0 with g > 0 violates the " + "Vulkan specification"); + if (c_override >= 0) + *c_count = c_override; +} + +static void +anv_physical_device_init_queue_families(struct anv_physical_device *pdevice) +{ + uint32_t family_count = 0; + + if (pdevice->engine_info) { + int gc_count = + intel_gem_count_engines(pdevice->engine_info, + I915_ENGINE_CLASS_RENDER); + int g_count = 0; + int c_count = 0; + if (env_var_as_boolean("INTEL_COMPUTE_CLASS", false)) + c_count = intel_gem_count_engines(pdevice->engine_info, + I915_ENGINE_CLASS_COMPUTE); + enum drm_i915_gem_engine_class compute_class = + c_count < 1 ? I915_ENGINE_CLASS_RENDER : I915_ENGINE_CLASS_COMPUTE; + + anv_override_engine_counts(&gc_count, &g_count, &c_count); + + if (gc_count > 0) { + pdevice->queue.families[family_count++] = (struct anv_queue_family) { + .queueFlags = VK_QUEUE_GRAPHICS_BIT | + VK_QUEUE_COMPUTE_BIT | + VK_QUEUE_TRANSFER_BIT, + .queueCount = gc_count, + .engine_class = I915_ENGINE_CLASS_RENDER, + }; + } + if (g_count > 0) { + pdevice->queue.families[family_count++] = (struct anv_queue_family) { + .queueFlags = VK_QUEUE_GRAPHICS_BIT | + VK_QUEUE_TRANSFER_BIT, + .queueCount = g_count, + .engine_class = I915_ENGINE_CLASS_RENDER, + }; + } + if (c_count > 0) { + pdevice->queue.families[family_count++] = (struct anv_queue_family) { + .queueFlags = VK_QUEUE_COMPUTE_BIT | + VK_QUEUE_TRANSFER_BIT, + .queueCount = c_count, + .engine_class = compute_class, + }; + } + /* Increase count below when other families are added as a reminder to + * increase the ANV_MAX_QUEUE_FAMILIES value. 
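+       * (At most three families are exposed here: gc, g and c, which is
+       * what the assert below checks.)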
+ */ + STATIC_ASSERT(ANV_MAX_QUEUE_FAMILIES >= 3); + } else { + /* Default to a single render queue */ + pdevice->queue.families[family_count++] = (struct anv_queue_family) { + .queueFlags = VK_QUEUE_GRAPHICS_BIT | + VK_QUEUE_COMPUTE_BIT | + VK_QUEUE_TRANSFER_BIT, + .queueCount = 1, + .engine_class = I915_ENGINE_CLASS_RENDER, + }; + family_count = 1; + } + assert(family_count <= ANV_MAX_QUEUE_FAMILIES); + pdevice->queue.family_count = family_count; +} + +static VkResult +anv_physical_device_try_create(struct vk_instance *vk_instance, + struct _drmDevice *drm_device, + struct vk_physical_device **out) +{ + struct anv_instance *instance = + container_of(vk_instance, struct anv_instance, vk); + + if (!(drm_device->available_nodes & (1 << DRM_NODE_RENDER)) || + drm_device->bustype != DRM_BUS_PCI || + drm_device->deviceinfo.pci->vendor_id != 0x8086) + return VK_ERROR_INCOMPATIBLE_DRIVER; + + const char *primary_path = drm_device->nodes[DRM_NODE_PRIMARY]; + const char *path = drm_device->nodes[DRM_NODE_RENDER]; + VkResult result; + int fd; + int master_fd = -1; + + brw_process_intel_debug_variable(); + + fd = open(path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + if (errno == ENOMEM) { + return vk_errorf(instance, VK_ERROR_OUT_OF_HOST_MEMORY, + "Unable to open device %s: out of memory", path); + } + return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "Unable to open device %s: %m", path); + } + + struct intel_device_info devinfo; + if (!intel_get_device_info_from_fd(fd, &devinfo)) { + result = vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER); + goto fail_fd; + } + + bool is_alpha = true; + if (devinfo.platform == INTEL_PLATFORM_HSW) { + mesa_logw("Haswell Vulkan support is incomplete"); + } else if (devinfo.platform == INTEL_PLATFORM_IVB) { + mesa_logw("Ivy Bridge Vulkan support is incomplete"); + } else if (devinfo.platform == INTEL_PLATFORM_BYT) { + mesa_logw("Bay Trail Vulkan support is incomplete"); + } else if (devinfo.ver == 8) { + /* Gfx8 fully supported */ + is_alpha = false; + } else { + result = vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "Vulkan not yet supported on %s", devinfo.name); + goto fail_fd; + } + + struct anv_physical_device *device = + vk_zalloc(&instance->vk.alloc, sizeof(*device), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (device == NULL) { + result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_fd; + } + + struct vk_physical_device_dispatch_table dispatch_table; + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &anv_physical_device_entrypoints, true); + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_physical_device_entrypoints, false); + + result = vk_physical_device_init(&device->vk, &instance->vk, + NULL, /* We set up extensions later */ + &dispatch_table); + if (result != VK_SUCCESS) { + vk_error(instance, result); + goto fail_alloc; + } + device->instance = instance; + + assert(strlen(path) < ARRAY_SIZE(device->path)); + snprintf(device->path, ARRAY_SIZE(device->path), "%s", path); + + device->info = devinfo; + device->is_alpha = is_alpha; + + device->cmd_parser_version = -1; + if (device->info.ver == 7) { + device->cmd_parser_version = + anv_gem_get_param(fd, I915_PARAM_CMD_PARSER_VERSION); + if (device->cmd_parser_version == -1) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "failed to get command parser version"); + goto fail_base; + } + } + + if (!anv_gem_get_param(fd, I915_PARAM_HAS_WAIT_TIMEOUT)) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + 
"kernel missing gem wait"); + goto fail_base; + } + + if (!anv_gem_get_param(fd, I915_PARAM_HAS_EXECBUF2)) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing execbuf2"); + goto fail_base; + } + + if (!device->info.has_llc && + anv_gem_get_param(fd, I915_PARAM_MMAP_VERSION) < 1) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing wc mmap"); + goto fail_base; + } + + device->use_relocations = device->info.ver < 8 || + device->info.platform == INTEL_PLATFORM_CHV; + + if (!device->use_relocations && + !anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN)) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing softpin"); + goto fail_alloc; + } + + if (!anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_FENCE_ARRAY)) { + result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED, + "kernel missing syncobj support"); + goto fail_base; + } + + device->has_exec_async = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC); + device->has_exec_capture = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE); + + /* Start with medium; sorted low to high */ + const int priorities[] = { + INTEL_CONTEXT_MEDIUM_PRIORITY, + INTEL_CONTEXT_HIGH_PRIORITY, + INTEL_CONTEXT_REALTIME_PRIORITY, + }; + device->max_context_priority = INT_MIN; + for (unsigned i = 0; i < ARRAY_SIZE(priorities); i++) { + if (!anv_gem_has_context_priority(fd, priorities[i])) + break; + device->max_context_priority = priorities[i]; + } + + device->gtt_size = device->info.gtt_size ? device->info.gtt_size : + device->info.aperture_bytes; + + /* We only allow 48-bit addresses with softpin because knowing the actual + * address is required for the vertex cache flush workaround. + */ + device->supports_48bit_addresses = (device->info.ver >= 8) && + device->gtt_size > (4ULL << 30 /* GiB */); + + result = anv_physical_device_init_heaps(device, fd); + if (result != VK_SUCCESS) + goto fail_base; + + assert(device->supports_48bit_addresses == !device->use_relocations); + device->use_softpin = !device->use_relocations; + + device->has_context_isolation = + anv_gem_get_param(fd, I915_PARAM_HAS_CONTEXT_ISOLATION); + + device->has_exec_timeline = + anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_TIMELINE_FENCES); + if (env_var_as_boolean("ANV_QUEUE_THREAD_DISABLE", false)) + device->has_exec_timeline = false; + + unsigned st_idx = 0; + + device->sync_syncobj_type = vk_drm_syncobj_get_type(fd); + if (!device->has_exec_timeline) + device->sync_syncobj_type.features &= ~VK_SYNC_FEATURE_TIMELINE; + device->sync_types[st_idx++] = &device->sync_syncobj_type; + + if (!(device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT)) + device->sync_types[st_idx++] = &anv_bo_sync_type; + + if (!(device->sync_syncobj_type.features & VK_SYNC_FEATURE_TIMELINE)) { + device->sync_timeline_type = vk_sync_timeline_get_type(&anv_bo_sync_type); + device->sync_types[st_idx++] = &device->sync_timeline_type.sync; + } + + device->sync_types[st_idx++] = NULL; + assert(st_idx <= ARRAY_SIZE(device->sync_types)); + device->vk.supported_sync_types = device->sync_types; + + device->vk.pipeline_cache_import_ops = anv_cache_import_ops; + + device->always_use_bindless = + env_var_as_boolean("ANV_ALWAYS_BINDLESS", false); + + device->use_call_secondary = + device->use_softpin && + !env_var_as_boolean("ANV_DISABLE_SECONDARY_CMD_BUFFER_CALLS", false); + + /* We first got the A64 messages on broadwell and we can only use them if + * we can pass addresses directly into the shader which requires softpin. 
+ */ + device->has_a64_buffer_access = device->info.ver >= 8 && + device->use_softpin; + + /* We first get bindless image access on Skylake. + */ + device->has_bindless_images = device->info.ver >= 9; + + /* We've had bindless samplers since Ivy Bridge (forever in Vulkan terms) + * because it's just a matter of setting the sampler address in the sample + * message header. However, we've not bothered to wire it up for vec4 so + * we leave it disabled on gfx7. + */ + device->has_bindless_samplers = device->info.ver >= 8; + + device->has_implicit_ccs = device->info.has_aux_map || + device->info.verx10 >= 125; + + /* Check if we can read the GPU timestamp register from the CPU */ + uint64_t u64_ignore; + device->has_reg_timestamp = anv_gem_reg_read(fd, TIMESTAMP | I915_REG_READ_8B_WA, + &u64_ignore) == 0; + + device->always_flush_cache = INTEL_DEBUG(DEBUG_STALL) || + driQueryOptionb(&instance->dri_options, "always_flush_cache"); + + device->has_mmap_offset = + anv_gem_get_param(fd, I915_PARAM_MMAP_GTT_VERSION) >= 4; + + device->has_userptr_probe = + anv_gem_get_param(fd, I915_PARAM_HAS_USERPTR_PROBE); + + device->compiler = brw_compiler_create(NULL, &device->info); + if (device->compiler == NULL) { + result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_base; + } + device->compiler->shader_debug_log = compiler_debug_log; + device->compiler->shader_perf_log = compiler_perf_log; + device->compiler->constant_buffer_0_is_relative = + device->info.ver < 8 || !device->has_context_isolation; + device->compiler->supports_shader_constants = true; + device->compiler->indirect_ubos_use_sampler = device->info.ver < 12; + + isl_device_init(&device->isl_dev, &device->info); + + result = anv_physical_device_init_uuids(device); + if (result != VK_SUCCESS) + goto fail_compiler; + + anv_physical_device_init_disk_cache(device); + + if (instance->vk.enabled_extensions.KHR_display) { + master_fd = open(primary_path, O_RDWR | O_CLOEXEC); + if (master_fd >= 0) { + /* prod the device with a GETPARAM call which will fail if + * we don't have permission to even render on this device + */ + if (anv_gem_get_param(master_fd, I915_PARAM_CHIPSET_ID) == 0) { + close(master_fd); + master_fd = -1; + } + } + } + device->master_fd = master_fd; + + device->engine_info = anv_gem_get_engine_info(fd); + anv_physical_device_init_queue_families(device); + + device->local_fd = fd; + + anv_physical_device_init_perf(device, fd); + + get_device_extensions(device, &device->vk.supported_extensions); + + result = anv_init_wsi(device); + if (result != VK_SUCCESS) + goto fail_perf; + + anv_measure_device_init(device); + + anv_genX(&device->info, init_physical_device_state)(device); + + *out = &device->vk; + + struct stat st; + + if (stat(primary_path, &st) == 0) { + device->has_master = true; + device->master_major = major(st.st_rdev); + device->master_minor = minor(st.st_rdev); + } else { + device->has_master = false; + device->master_major = 0; + device->master_minor = 0; + } + + if (stat(path, &st) == 0) { + device->has_local = true; + device->local_major = major(st.st_rdev); + device->local_minor = minor(st.st_rdev); + } else { + device->has_local = false; + device->local_major = 0; + device->local_minor = 0; + } + + return VK_SUCCESS; + +fail_perf: + ralloc_free(device->perf); + free(device->engine_info); + anv_physical_device_free_disk_cache(device); +fail_compiler: + ralloc_free(device->compiler); +fail_base: + vk_physical_device_finish(&device->vk); +fail_alloc: + vk_free(&instance->vk.alloc, device); +fail_fd: + 
close(fd); + if (master_fd != -1) + close(master_fd); + return result; +} + +static void +anv_physical_device_destroy(struct vk_physical_device *vk_device) +{ + struct anv_physical_device *device = + container_of(vk_device, struct anv_physical_device, vk); + + anv_finish_wsi(device); + anv_measure_device_destroy(device); + free(device->engine_info); + anv_physical_device_free_disk_cache(device); + ralloc_free(device->compiler); + ralloc_free(device->perf); + close(device->local_fd); + if (device->master_fd >= 0) + close(device->master_fd); + vk_physical_device_finish(&device->vk); + vk_free(&device->instance->vk.alloc, device); +} + +VkResult anv_EnumerateInstanceExtensionProperties( + const char* pLayerName, + uint32_t* pPropertyCount, + VkExtensionProperties* pProperties) +{ + if (pLayerName) + return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT); + + return vk_enumerate_instance_extension_properties( + &instance_extensions, pPropertyCount, pProperties); +} + +static void +anv_init_dri_options(struct anv_instance *instance) +{ + driParseOptionInfo(&instance->available_dri_options, anv_dri_options, + ARRAY_SIZE(anv_dri_options)); + driParseConfigFiles(&instance->dri_options, + &instance->available_dri_options, 0, "anv", NULL, NULL, + instance->vk.app_info.app_name, + instance->vk.app_info.app_version, + instance->vk.app_info.engine_name, + instance->vk.app_info.engine_version); + + instance->assume_full_subgroups = + driQueryOptionb(&instance->dri_options, "anv_assume_full_subgroups"); + instance->limit_trig_input_range = + driQueryOptionb(&instance->dri_options, "limit_trig_input_range"); + instance->sample_mask_out_opengl_behaviour = + driQueryOptionb(&instance->dri_options, "anv_sample_mask_out_opengl_behaviour"); +} + +VkResult anv_CreateInstance( + const VkInstanceCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkInstance* pInstance) +{ + struct anv_instance *instance; + VkResult result; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO); + + if (pAllocator == NULL) + pAllocator = vk_default_allocator(); + + instance = vk_alloc(pAllocator, sizeof(*instance), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!instance) + return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct vk_instance_dispatch_table dispatch_table; + vk_instance_dispatch_table_from_entrypoints( + &dispatch_table, &anv_instance_entrypoints, true); + vk_instance_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_instance_entrypoints, false); + + result = vk_instance_init(&instance->vk, &instance_extensions, + &dispatch_table, pCreateInfo, pAllocator); + if (result != VK_SUCCESS) { + vk_free(pAllocator, instance); + return vk_error(NULL, result); + } + + instance->vk.physical_devices.try_create_for_drm = anv_physical_device_try_create; + instance->vk.physical_devices.destroy = anv_physical_device_destroy; + + VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false)); + + anv_init_dri_options(instance); + + intel_driver_ds_init(); + + *pInstance = anv_instance_to_handle(instance); + + return VK_SUCCESS; +} + +void anv_DestroyInstance( + VkInstance _instance, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_instance, instance, _instance); + + if (!instance) + return; + + VG(VALGRIND_DESTROY_MEMPOOL(instance)); + + driDestroyOptionCache(&instance->dri_options); + driDestroyOptionInfo(&instance->available_dri_options); + + vk_instance_finish(&instance->vk); + vk_free(&instance->vk.alloc, instance); +} + +void anv_GetPhysicalDeviceFeatures( + VkPhysicalDevice 
physicalDevice, + VkPhysicalDeviceFeatures* pFeatures) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + + /* Just pick one; they're all the same */ + const bool has_astc_ldr = + isl_format_supports_sampling(&pdevice->info, + ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16); + + *pFeatures = (VkPhysicalDeviceFeatures) { + .robustBufferAccess = true, + .fullDrawIndexUint32 = true, + .imageCubeArray = true, + .independentBlend = true, + .geometryShader = true, + .tessellationShader = true, + .sampleRateShading = true, + .dualSrcBlend = true, + .logicOp = true, + .multiDrawIndirect = true, + .drawIndirectFirstInstance = true, + .depthClamp = true, + .depthBiasClamp = true, + .fillModeNonSolid = true, + .depthBounds = pdevice->info.ver >= 12, + .wideLines = true, + .largePoints = true, + .alphaToOne = true, + .multiViewport = true, + .samplerAnisotropy = true, + .textureCompressionETC2 = pdevice->info.ver >= 8 || + pdevice->info.platform == INTEL_PLATFORM_BYT, + .textureCompressionASTC_LDR = has_astc_ldr, + .textureCompressionBC = true, + .occlusionQueryPrecise = true, + .pipelineStatisticsQuery = true, + .fragmentStoresAndAtomics = true, + .shaderTessellationAndGeometryPointSize = true, + .shaderImageGatherExtended = true, + .shaderStorageImageExtendedFormats = true, + .shaderStorageImageMultisample = false, + .shaderStorageImageReadWithoutFormat = false, + .shaderStorageImageWriteWithoutFormat = true, + .shaderUniformBufferArrayDynamicIndexing = true, + .shaderSampledImageArrayDynamicIndexing = true, + .shaderStorageBufferArrayDynamicIndexing = true, + .shaderStorageImageArrayDynamicIndexing = true, + .shaderClipDistance = true, + .shaderCullDistance = true, + .shaderFloat64 = pdevice->info.ver >= 8 && + pdevice->info.has_64bit_float, + .shaderInt64 = pdevice->info.ver >= 8, + .shaderInt16 = pdevice->info.ver >= 8, + .shaderResourceMinLod = pdevice->info.ver >= 9, + .variableMultisampleRate = true, + .inheritedQueries = true, + }; + + /* We can't do image stores in vec4 shaders */ + pFeatures->vertexPipelineStoresAndAtomics = + pdevice->compiler->scalar_stage[MESA_SHADER_VERTEX] && + pdevice->compiler->scalar_stage[MESA_SHADER_GEOMETRY]; + + struct vk_app_info *app_info = &pdevice->instance->vk.app_info; + + /* The new DOOM and Wolfenstein games require depthBounds without + * checking for it. They seem to run fine without it so just claim it's + * there and accept the consequences. 
+ */ + if (app_info->engine_name && strcmp(app_info->engine_name, "idTech") == 0) + pFeatures->depthBounds = true; +} + +static void +anv_get_physical_device_features_1_1(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan11Features *f) +{ + assert(f->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES); + + f->storageBuffer16BitAccess = pdevice->info.ver >= 8; + f->uniformAndStorageBuffer16BitAccess = pdevice->info.ver >= 8; + f->storagePushConstant16 = pdevice->info.ver >= 8; + f->storageInputOutput16 = false; + f->multiview = true; + f->multiviewGeometryShader = true; + f->multiviewTessellationShader = true; + f->variablePointersStorageBuffer = true; + f->variablePointers = true; + f->protectedMemory = false; + f->samplerYcbcrConversion = true; + f->shaderDrawParameters = true; +} + +static void +anv_get_physical_device_features_1_2(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan12Features *f) +{ + assert(f->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES); + + f->samplerMirrorClampToEdge = true; + f->drawIndirectCount = true; + f->storageBuffer8BitAccess = pdevice->info.ver >= 8; + f->uniformAndStorageBuffer8BitAccess = pdevice->info.ver >= 8; + f->storagePushConstant8 = pdevice->info.ver >= 8; + f->shaderBufferInt64Atomics = pdevice->info.ver >= 9; + f->shaderSharedInt64Atomics = false; + f->shaderFloat16 = pdevice->info.ver >= 8; + f->shaderInt8 = pdevice->info.ver >= 8; + + bool descIndexing = pdevice->has_a64_buffer_access && + pdevice->has_bindless_images; + f->descriptorIndexing = descIndexing; + f->shaderInputAttachmentArrayDynamicIndexing = false; + f->shaderUniformTexelBufferArrayDynamicIndexing = descIndexing; + f->shaderStorageTexelBufferArrayDynamicIndexing = descIndexing; + f->shaderUniformBufferArrayNonUniformIndexing = false; + f->shaderSampledImageArrayNonUniformIndexing = descIndexing; + f->shaderStorageBufferArrayNonUniformIndexing = descIndexing; + f->shaderStorageImageArrayNonUniformIndexing = descIndexing; + f->shaderInputAttachmentArrayNonUniformIndexing = false; + f->shaderUniformTexelBufferArrayNonUniformIndexing = descIndexing; + f->shaderStorageTexelBufferArrayNonUniformIndexing = descIndexing; + f->descriptorBindingUniformBufferUpdateAfterBind = descIndexing; + f->descriptorBindingSampledImageUpdateAfterBind = descIndexing; + f->descriptorBindingStorageImageUpdateAfterBind = descIndexing; + f->descriptorBindingStorageBufferUpdateAfterBind = descIndexing; + f->descriptorBindingUniformTexelBufferUpdateAfterBind = descIndexing; + f->descriptorBindingStorageTexelBufferUpdateAfterBind = descIndexing; + f->descriptorBindingUpdateUnusedWhilePending = descIndexing; + f->descriptorBindingPartiallyBound = descIndexing; + f->descriptorBindingVariableDescriptorCount = descIndexing; + f->runtimeDescriptorArray = descIndexing; + + f->samplerFilterMinmax = pdevice->info.ver >= 9; + f->scalarBlockLayout = true; + f->imagelessFramebuffer = true; + f->uniformBufferStandardLayout = true; + f->shaderSubgroupExtendedTypes = true; + f->separateDepthStencilLayouts = true; + f->hostQueryReset = true; + f->timelineSemaphore = true; + f->bufferDeviceAddress = pdevice->has_a64_buffer_access; + f->bufferDeviceAddressCaptureReplay = pdevice->has_a64_buffer_access; + f->bufferDeviceAddressMultiDevice = false; + f->vulkanMemoryModel = true; + f->vulkanMemoryModelDeviceScope = true; + f->vulkanMemoryModelAvailabilityVisibilityChains = true; + f->shaderOutputViewportIndex = true; + f->shaderOutputLayer = true; + f->subgroupBroadcastDynamicId 
= true; +} + +static void +anv_get_physical_device_features_1_3(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan13Features *f) +{ + assert(f->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES); + + f->robustImageAccess = true; + f->inlineUniformBlock = true; + f->descriptorBindingInlineUniformBlockUpdateAfterBind = true; + f->pipelineCreationCacheControl = true; + f->privateData = true; + f->shaderDemoteToHelperInvocation = true; + f->shaderTerminateInvocation = true; + f->subgroupSizeControl = true; + f->computeFullSubgroups = true; + f->synchronization2 = true; + f->textureCompressionASTC_HDR = false; + f->shaderZeroInitializeWorkgroupMemory = true; + f->dynamicRendering = true; + f->shaderIntegerDotProduct = true; + f->maintenance4 = true; +} + +void anv_GetPhysicalDeviceFeatures2( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceFeatures2* pFeatures) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + anv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features); + + VkPhysicalDeviceVulkan11Features core_1_1 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES, + }; + anv_get_physical_device_features_1_1(pdevice, &core_1_1); + + VkPhysicalDeviceVulkan12Features core_1_2 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, + }; + anv_get_physical_device_features_1_2(pdevice, &core_1_2); + + VkPhysicalDeviceVulkan13Features core_1_3 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES, + }; + anv_get_physical_device_features_1_3(pdevice, &core_1_3); + + vk_foreach_struct(ext, pFeatures->pNext) { + if (vk_get_physical_device_core_1_1_feature_ext(ext, &core_1_1)) + continue; + if (vk_get_physical_device_core_1_2_feature_ext(ext, &core_1_2)) + continue; + if (vk_get_physical_device_core_1_3_feature_ext(ext, &core_1_3)) + continue; + + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_4444_FORMATS_FEATURES_EXT: { + VkPhysicalDevice4444FormatsFeaturesEXT *features = + (VkPhysicalDevice4444FormatsFeaturesEXT *)ext; + features->formatA4R4G4B4 = true; + features->formatA4B4G4R4 = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR: { + VkPhysicalDeviceAccelerationStructureFeaturesKHR *features = (void *)ext; + features->accelerationStructure = false; + features->accelerationStructureCaptureReplay = false; + features->accelerationStructureIndirectBuild = false; + features->accelerationStructureHostCommands = false; + features->descriptorBindingAccelerationStructureUpdateAfterBind = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_EXT: { + VkPhysicalDeviceBufferDeviceAddressFeaturesEXT *features = (void *)ext; + features->bufferDeviceAddress = pdevice->has_a64_buffer_access; + features->bufferDeviceAddressCaptureReplay = false; + features->bufferDeviceAddressMultiDevice = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BORDER_COLOR_SWIZZLE_FEATURES_EXT: { + VkPhysicalDeviceBorderColorSwizzleFeaturesEXT *features = + (VkPhysicalDeviceBorderColorSwizzleFeaturesEXT *)ext; + features->borderColorSwizzle = true; + features->borderColorSwizzleFromImage = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: { + VkPhysicalDeviceColorWriteEnableFeaturesEXT *features = + (VkPhysicalDeviceColorWriteEnableFeaturesEXT *)ext; + features->colorWriteEnable = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_2D_VIEW_OF_3D_FEATURES_EXT: { 
+ VkPhysicalDeviceImage2DViewOf3DFeaturesEXT *features = + (VkPhysicalDeviceImage2DViewOf3DFeaturesEXT *)ext; + features->image2DViewOf3D = true; + features->sampler2DViewOf3D = pdevice->info.ver >= 9; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_NV: { + VkPhysicalDeviceComputeShaderDerivativesFeaturesNV *features = + (VkPhysicalDeviceComputeShaderDerivativesFeaturesNV *)ext; + features->computeDerivativeGroupQuads = true; + features->computeDerivativeGroupLinear = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: { + VkPhysicalDeviceConditionalRenderingFeaturesEXT *features = + (VkPhysicalDeviceConditionalRenderingFeaturesEXT*)ext; + features->conditionalRendering = pdevice->info.verx10 >= 75; + features->inheritedConditionalRendering = pdevice->info.verx10 >= 75; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: { + VkPhysicalDeviceCustomBorderColorFeaturesEXT *features = + (VkPhysicalDeviceCustomBorderColorFeaturesEXT *)ext; + features->customBorderColors = pdevice->info.ver >= 8; + features->customBorderColorWithoutFormat = pdevice->info.ver >= 8; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_ENABLE_FEATURES_EXT: { + VkPhysicalDeviceDepthClipEnableFeaturesEXT *features = + (VkPhysicalDeviceDepthClipEnableFeaturesEXT *)ext; + features->depthClipEnable = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_INTERLOCK_FEATURES_EXT: { + VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT *features = + (VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT *)ext; + features->fragmentShaderSampleInterlock = pdevice->info.ver >= 9; + features->fragmentShaderPixelInterlock = pdevice->info.ver >= 9; + features->fragmentShaderShadingRateInterlock = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GLOBAL_PRIORITY_QUERY_FEATURES_KHR: { + VkPhysicalDeviceGlobalPriorityQueryFeaturesKHR *features = + (VkPhysicalDeviceGlobalPriorityQueryFeaturesKHR *)ext; + features->globalPriorityQuery = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_FEATURES_KHR: { + VkPhysicalDeviceFragmentShadingRateFeaturesKHR *features = + (VkPhysicalDeviceFragmentShadingRateFeaturesKHR *)ext; + features->attachmentFragmentShadingRate = false; + features->pipelineFragmentShadingRate = true; + features->primitiveFragmentShadingRate = + pdevice->info.has_coarse_pixel_primitive_and_cb; + features->attachmentFragmentShadingRate = + pdevice->info.has_coarse_pixel_primitive_and_cb; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_VIEW_MIN_LOD_FEATURES_EXT: { + VkPhysicalDeviceImageViewMinLodFeaturesEXT *features = + (VkPhysicalDeviceImageViewMinLodFeaturesEXT *)ext; + features->minLod = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT: { + VkPhysicalDeviceIndexTypeUint8FeaturesEXT *features = + (VkPhysicalDeviceIndexTypeUint8FeaturesEXT *)ext; + features->indexTypeUint8 = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_FEATURES_EXT: { + VkPhysicalDeviceLineRasterizationFeaturesEXT *features = + (VkPhysicalDeviceLineRasterizationFeaturesEXT *)ext; + /* Rectangular lines must use the strict algorithm, which is not + * supported for wide lines prior to ICL. See rasterization_mode for + * details and how the HW states are programmed. 
+ */ + features->rectangularLines = pdevice->info.ver >= 10; + features->bresenhamLines = true; + /* Support for Smooth lines with MSAA was removed on gfx11. From the + * BSpec section "Multisample ModesState" table for "AA Line Support + * Requirements": + * + * GFX10:BUG:######## NUM_MULTISAMPLES == 1 + * + * Fortunately, this isn't a case most people care about. + */ + features->smoothLines = pdevice->info.ver < 10; + features->stippledRectangularLines = false; + features->stippledBresenhamLines = true; + features->stippledSmoothLines = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_FEATURES_NV: { + VkPhysicalDeviceMeshShaderFeaturesNV *features = + (VkPhysicalDeviceMeshShaderFeaturesNV *)ext; + features->taskShader = pdevice->vk.supported_extensions.NV_mesh_shader; + features->meshShader = pdevice->vk.supported_extensions.NV_mesh_shader; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MUTABLE_DESCRIPTOR_TYPE_FEATURES_VALVE: { + VkPhysicalDeviceMutableDescriptorTypeFeaturesVALVE *features = + (VkPhysicalDeviceMutableDescriptorTypeFeaturesVALVE *)ext; + features->mutableDescriptorType = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: { + VkPhysicalDevicePerformanceQueryFeaturesKHR *feature = + (VkPhysicalDevicePerformanceQueryFeaturesKHR *)ext; + feature->performanceCounterQueryPools = true; + /* HW only supports a single configuration at a time. */ + feature->performanceCounterMultipleQueryPools = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR: { + VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *features = + (VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *)ext; + features->pipelineExecutableInfo = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVES_GENERATED_QUERY_FEATURES_EXT: { + VkPhysicalDevicePrimitivesGeneratedQueryFeaturesEXT *features = + (VkPhysicalDevicePrimitivesGeneratedQueryFeaturesEXT *)ext; + features->primitivesGeneratedQuery = true; + features->primitivesGeneratedQueryWithRasterizerDiscard = false; + features->primitivesGeneratedQueryWithNonZeroStreams = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_FEATURES_EXT: { + VkPhysicalDeviceProvokingVertexFeaturesEXT *features = + (VkPhysicalDeviceProvokingVertexFeaturesEXT *)ext; + features->provokingVertexLast = true; + features->transformFeedbackPreservesProvokingVertex = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_QUERY_FEATURES_KHR: { + VkPhysicalDeviceRayQueryFeaturesKHR *features = (void *)ext; + features->rayQuery = pdevice->info.has_ray_tracing; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: { + VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext; + features->robustBufferAccess2 = true; + features->robustImageAccess2 = true; + features->nullDescriptor = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_FEATURES_EXT: { + VkPhysicalDeviceShaderAtomicFloatFeaturesEXT *features = (void *)ext; + features->shaderBufferFloat32Atomics = true; + features->shaderBufferFloat32AtomicAdd = pdevice->info.has_lsc; + features->shaderBufferFloat64Atomics = + pdevice->info.has_64bit_float && pdevice->info.has_lsc; + features->shaderBufferFloat64AtomicAdd = false; + features->shaderSharedFloat32Atomics = true; + features->shaderSharedFloat32AtomicAdd = false; + features->shaderSharedFloat64Atomics = false; + 
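/* Illustrative sketch, not part of this patch: how an application queries one
 * of the extension feature structs handled by the switch above, by chaining
 * it into the pNext list of VkPhysicalDeviceFeatures2.  `physical_device` is
 * assumed to be a valid VkPhysicalDevice from the application's own
 * enumeration code.
 */
#include <stdio.h>
#include <vulkan/vulkan.h>

static void
query_conditional_rendering_features(VkPhysicalDevice physical_device)
{
   VkPhysicalDeviceConditionalRenderingFeaturesEXT cond_rendering = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT,
   };
   VkPhysicalDeviceFeatures2 features2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &cond_rendering,
   };

   /* The driver-side loop above recognizes the chained struct by sType and
    * fills in its members.
    */
   vkGetPhysicalDeviceFeatures2(physical_device, &features2);

   printf("conditionalRendering: %u, inheritedConditionalRendering: %u\n",
          cond_rendering.conditionalRendering,
          cond_rendering.inheritedConditionalRendering);
}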
features->shaderSharedFloat64AtomicAdd = false; + features->shaderImageFloat32Atomics = true; + features->shaderImageFloat32AtomicAdd = false; + features->sparseImageFloat32Atomics = false; + features->sparseImageFloat32AtomicAdd = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_2_FEATURES_EXT: { + VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT *features = (void *)ext; + features->shaderBufferFloat16Atomics = false; + features->shaderBufferFloat16AtomicAdd = false; + features->shaderBufferFloat16AtomicMinMax = false; + features->shaderBufferFloat32AtomicMinMax = pdevice->info.ver >= 9; + features->shaderBufferFloat64AtomicMinMax = + pdevice->info.has_64bit_float && pdevice->info.has_lsc; + features->shaderSharedFloat16Atomics = false; + features->shaderSharedFloat16AtomicAdd = false; + features->shaderSharedFloat16AtomicMinMax = false; + features->shaderSharedFloat32AtomicMinMax = pdevice->info.ver >= 9; + features->shaderSharedFloat64AtomicMinMax = false; + features->shaderImageFloat32AtomicMinMax = false; + features->sparseImageFloat32AtomicMinMax = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CLOCK_FEATURES_KHR: { + VkPhysicalDeviceShaderClockFeaturesKHR *features = + (VkPhysicalDeviceShaderClockFeaturesKHR *)ext; + features->shaderSubgroupClock = true; + features->shaderDeviceClock = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_FUNCTIONS_2_FEATURES_INTEL: { + VkPhysicalDeviceShaderIntegerFunctions2FeaturesINTEL *features = + (VkPhysicalDeviceShaderIntegerFunctions2FeaturesINTEL *)ext; + features->shaderIntegerFunctions2 = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MODULE_IDENTIFIER_FEATURES_EXT: { + VkPhysicalDeviceShaderModuleIdentifierFeaturesEXT *features = + (VkPhysicalDeviceShaderModuleIdentifierFeaturesEXT *)ext; + features->shaderModuleIdentifier = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_UNIFORM_CONTROL_FLOW_FEATURES_KHR: { + VkPhysicalDeviceShaderSubgroupUniformControlFlowFeaturesKHR *features = + (VkPhysicalDeviceShaderSubgroupUniformControlFlowFeaturesKHR *)ext; + features->shaderSubgroupUniformControlFlow = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT: { + VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *features = + (VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *)ext; + features->texelBufferAlignment = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT: { + VkPhysicalDeviceTransformFeedbackFeaturesEXT *features = + (VkPhysicalDeviceTransformFeedbackFeaturesEXT *)ext; + features->transformFeedback = true; + features->geometryStreams = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: { + VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features = + (VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *)ext; + features->vertexAttributeInstanceRateDivisor = true; + features->vertexAttributeInstanceRateZeroDivisor = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_FEATURES_KHR: { + VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR *features = + (VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR *)ext; + features->workgroupMemoryExplicitLayout = true; + features->workgroupMemoryExplicitLayoutScalarBlockLayout = true; + features->workgroupMemoryExplicitLayout8BitAccess = true; + 
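/* Illustrative sketch, not part of this patch: after querying, the same
 * chained feature structs are passed back through VkDeviceCreateInfo::pNext
 * to enable the features at device creation.  `physical_device` and
 * `queue_family_index` are assumed to come from the application's earlier
 * setup; when a VkPhysicalDeviceFeatures2 is chained, pEnabledFeatures must
 * be NULL.
 */
#include <vulkan/vulkan.h>

static VkResult
create_device_with_queried_features(VkPhysicalDevice physical_device,
                                    uint32_t queue_family_index,
                                    VkDevice *device_out)
{
   /* Query everything the implementation supports... */
   VkPhysicalDeviceFeatures2 features2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
   };
   vkGetPhysicalDeviceFeatures2(physical_device, &features2);

   /* ...and enable it wholesale (a real application would clear the
    * features it does not need).
    */
   const float priority = 1.0f;
   const VkDeviceQueueCreateInfo queue_info = {
      .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
      .queueFamilyIndex = queue_family_index,
      .queueCount = 1,
      .pQueuePriorities = &priority,
   };
   const VkDeviceCreateInfo device_info = {
      .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
      .pNext = &features2,
      .queueCreateInfoCount = 1,
      .pQueueCreateInfos = &queue_info,
      .pEnabledFeatures = NULL,
   };

   return vkCreateDevice(physical_device, &device_info, NULL, device_out);
}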
features->workgroupMemoryExplicitLayout16BitAccess = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_YCBCR_IMAGE_ARRAYS_FEATURES_EXT: { + VkPhysicalDeviceYcbcrImageArraysFeaturesEXT *features = + (VkPhysicalDeviceYcbcrImageArraysFeaturesEXT *)ext; + features->ycbcrImageArrays = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT: { + VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *features = + (VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *)ext; + features->extendedDynamicState = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_2_FEATURES_EXT: { + VkPhysicalDeviceExtendedDynamicState2FeaturesEXT *features = + (VkPhysicalDeviceExtendedDynamicState2FeaturesEXT *)ext; + features->extendedDynamicState2 = true; + features->extendedDynamicState2LogicOp = true; + features->extendedDynamicState2PatchControlPoints = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_FEATURES_EXT: { + VkPhysicalDeviceMultiDrawFeaturesEXT *features = (VkPhysicalDeviceMultiDrawFeaturesEXT *)ext; + features->multiDraw = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_NON_SEAMLESS_CUBE_MAP_FEATURES_EXT : { + VkPhysicalDeviceNonSeamlessCubeMapFeaturesEXT *features = + (VkPhysicalDeviceNonSeamlessCubeMapFeaturesEXT *)ext; + features->nonSeamlessCubeMap = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVE_TOPOLOGY_LIST_RESTART_FEATURES_EXT: { + VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT *features = + (VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT *)ext; + features->primitiveTopologyListRestart = true; + features->primitiveTopologyPatchListRestart = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_CONTROL_FEATURES_EXT: { + VkPhysicalDeviceDepthClipControlFeaturesEXT *features = + (VkPhysicalDeviceDepthClipControlFeaturesEXT *)ext; + features->depthClipControl = true; + break; + } + + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } + +} + +#define MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS 64 + +#define MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS 64 +#define MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS 256 + +#define MAX_CUSTOM_BORDER_COLORS 4096 + +void anv_GetPhysicalDeviceProperties( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceProperties* pProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + const struct intel_device_info *devinfo = &pdevice->info; + + const uint32_t max_ssbos = pdevice->has_a64_buffer_access ? UINT16_MAX : 64; + const uint32_t max_textures = + pdevice->has_bindless_images ? UINT16_MAX : 128; + const uint32_t max_samplers = + pdevice->has_bindless_samplers ? UINT16_MAX : + (devinfo->verx10 >= 75) ? 128 : 16; + const uint32_t max_images = + pdevice->has_bindless_images ? UINT16_MAX : MAX_IMAGES; + + /* If we can use bindless for everything, claim a high per-stage limit, + * otherwise use the binding table size, minus the slots reserved for + * render targets and one slot for the descriptor buffer. */ + const uint32_t max_per_stage = + pdevice->has_bindless_images && pdevice->has_a64_buffer_access + ? 
UINT32_MAX : MAX_BINDING_TABLE_SIZE - MAX_RTS - 1; + + const uint32_t max_workgroup_size = + MIN2(1024, 32 * devinfo->max_cs_workgroup_threads); + + VkSampleCountFlags sample_counts = + isl_device_get_sample_counts(&pdevice->isl_dev); + + + VkPhysicalDeviceLimits limits = { + .maxImageDimension1D = (1 << 14), + .maxImageDimension2D = (1 << 14), + .maxImageDimension3D = (1 << 11), + .maxImageDimensionCube = (1 << 14), + .maxImageArrayLayers = (1 << 11), + .maxTexelBufferElements = 128 * 1024 * 1024, + .maxUniformBufferRange = pdevice->compiler->indirect_ubos_use_sampler ? (1u << 27) : (1u << 30), + .maxStorageBufferRange = pdevice->isl_dev.max_buffer_size, + .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE, + .maxMemoryAllocationCount = UINT32_MAX, + .maxSamplerAllocationCount = 64 * 1024, + .bufferImageGranularity = 1, + .sparseAddressSpaceSize = 0, + .maxBoundDescriptorSets = MAX_SETS, + .maxPerStageDescriptorSamplers = max_samplers, + .maxPerStageDescriptorUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, + .maxPerStageDescriptorStorageBuffers = max_ssbos, + .maxPerStageDescriptorSampledImages = max_textures, + .maxPerStageDescriptorStorageImages = max_images, + .maxPerStageDescriptorInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS, + .maxPerStageResources = max_per_stage, + .maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */ + .maxDescriptorSetUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, /* number of stages * maxPerStageDescriptorUniformBuffers */ + .maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetStorageBuffers = 6 * max_ssbos, /* number of stages * maxPerStageDescriptorStorageBuffers */ + .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2, + .maxDescriptorSetSampledImages = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */ + .maxDescriptorSetStorageImages = 6 * max_images, /* number of stages * maxPerStageDescriptorStorageImages */ + .maxDescriptorSetInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS, + .maxVertexInputAttributes = MAX_VES, + .maxVertexInputBindings = MAX_VBS, + /* Broadwell PRMs: Volume 2d: Command Reference: Structures: + * + * VERTEX_ELEMENT_STATE::Source Element Offset: [0,2047] + */ + .maxVertexInputAttributeOffset = 2047, + /* Broadwell PRMs: Volume 2d: Command Reference: Structures: + * + * VERTEX_BUFFER_STATE::Buffer Pitch: [0,2048] + * + * Skylake PRMs: Volume 2d: Command Reference: Structures: + * + * VERTEX_BUFFER_STATE::Buffer Pitch: [0,4095] + */ + .maxVertexInputBindingStride = devinfo->ver < 9 ? 2048 : 4095, + .maxVertexOutputComponents = 128, + .maxTessellationGenerationLevel = 64, + .maxTessellationPatchSize = 32, + .maxTessellationControlPerVertexInputComponents = 128, + .maxTessellationControlPerVertexOutputComponents = 128, + .maxTessellationControlPerPatchOutputComponents = 128, + .maxTessellationControlTotalOutputComponents = 2048, + .maxTessellationEvaluationInputComponents = 128, + .maxTessellationEvaluationOutputComponents = 128, + .maxGeometryShaderInvocations = 32, + .maxGeometryInputComponents = devinfo->ver >= 8 ? 
128 : 64, + .maxGeometryOutputComponents = 128, + .maxGeometryOutputVertices = 256, + .maxGeometryTotalOutputComponents = 1024, + .maxFragmentInputComponents = 116, /* 128 components - (PSIZ, CLIP_DIST0, CLIP_DIST1) */ + .maxFragmentOutputAttachments = 8, + .maxFragmentDualSrcAttachments = 1, + .maxFragmentCombinedOutputResources = MAX_RTS + max_ssbos + max_images, + .maxComputeSharedMemorySize = 64 * 1024, + .maxComputeWorkGroupCount = { 65535, 65535, 65535 }, + .maxComputeWorkGroupInvocations = max_workgroup_size, + .maxComputeWorkGroupSize = { + max_workgroup_size, + max_workgroup_size, + max_workgroup_size, + }, + .subPixelPrecisionBits = 8, + .subTexelPrecisionBits = 8, + .mipmapPrecisionBits = 8, + .maxDrawIndexedIndexValue = UINT32_MAX, + .maxDrawIndirectCount = UINT32_MAX, + .maxSamplerLodBias = 16, + .maxSamplerAnisotropy = 16, + .maxViewports = MAX_VIEWPORTS, + .maxViewportDimensions = { (1 << 14), (1 << 14) }, + .viewportBoundsRange = { INT16_MIN, INT16_MAX }, + .viewportSubPixelBits = 13, /* We take a float? */ + .minMemoryMapAlignment = 4096, /* A page */ + /* The dataport requires texel alignment so we need to assume a worst + * case of R32G32B32A32 which is 16 bytes. + */ + .minTexelBufferOffsetAlignment = 16, + .minUniformBufferOffsetAlignment = ANV_UBO_ALIGNMENT, + .minStorageBufferOffsetAlignment = ANV_SSBO_ALIGNMENT, + .minTexelOffset = -8, + .maxTexelOffset = 7, + .minTexelGatherOffset = -32, + .maxTexelGatherOffset = 31, + .minInterpolationOffset = -0.5, + .maxInterpolationOffset = 0.4375, + .subPixelInterpolationOffsetBits = 4, + .maxFramebufferWidth = (1 << 14), + .maxFramebufferHeight = (1 << 14), + .maxFramebufferLayers = (1 << 11), + .framebufferColorSampleCounts = sample_counts, + .framebufferDepthSampleCounts = sample_counts, + .framebufferStencilSampleCounts = sample_counts, + .framebufferNoAttachmentsSampleCounts = sample_counts, + .maxColorAttachments = MAX_RTS, + .sampledImageColorSampleCounts = sample_counts, + .sampledImageIntegerSampleCounts = sample_counts, + .sampledImageDepthSampleCounts = sample_counts, + .sampledImageStencilSampleCounts = sample_counts, + .storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT, + .maxSampleMaskWords = 1, + .timestampComputeAndGraphics = true, + .timestampPeriod = 1000000000.0 / devinfo->timestamp_frequency, + .maxClipDistances = 8, + .maxCullDistances = 8, + .maxCombinedClipAndCullDistances = 8, + .discreteQueuePriorities = 2, + .pointSizeRange = { 0.125, 255.875 }, + /* While SKL and up support much wider lines than we are setting here, + * in practice we run into conformance issues if we go past this limit. + * Since the Windows driver does the same, it's probably fair to assume + * that no one needs more than this. + */ + .lineWidthRange = { 0.0, devinfo->ver >= 9 ? 8.0 : 7.9921875 }, + .pointSizeGranularity = (1.0 / 8.0), + .lineWidthGranularity = (1.0 / 128.0), + .strictLines = false, + .standardSampleLocations = true, + .optimalBufferCopyOffsetAlignment = 128, + .optimalBufferCopyRowPitchAlignment = 128, + .nonCoherentAtomSize = 64, + }; + + *pProperties = (VkPhysicalDeviceProperties) { + .apiVersion = ANV_API_VERSION, + .driverVersion = vk_get_driver_version(), + .vendorID = 0x8086, + .deviceID = pdevice->info.pci_device_id, + .deviceType = pdevice->info.has_local_mem ? + VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU : + VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU, + .limits = limits, + .sparseProperties = {0}, /* Broadwell doesn't do sparse. 
*/ + }; + + snprintf(pProperties->deviceName, sizeof(pProperties->deviceName), + "%s", pdevice->info.name); + memcpy(pProperties->pipelineCacheUUID, + pdevice->pipeline_cache_uuid, VK_UUID_SIZE); +} + +static void +anv_get_physical_device_properties_1_1(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan11Properties *p) +{ + assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES); + + memcpy(p->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE); + memcpy(p->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE); + memset(p->deviceLUID, 0, VK_LUID_SIZE); + p->deviceNodeMask = 0; + p->deviceLUIDValid = false; + + p->subgroupSize = BRW_SUBGROUP_SIZE; + VkShaderStageFlags scalar_stages = 0; + for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) { + if (pdevice->compiler->scalar_stage[stage]) + scalar_stages |= mesa_to_vk_shader_stage(stage); + } + if (pdevice->vk.supported_extensions.KHR_ray_tracing_pipeline) { + scalar_stages |= VK_SHADER_STAGE_RAYGEN_BIT_KHR | + VK_SHADER_STAGE_ANY_HIT_BIT_KHR | + VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | + VK_SHADER_STAGE_MISS_BIT_KHR | + VK_SHADER_STAGE_INTERSECTION_BIT_KHR | + VK_SHADER_STAGE_CALLABLE_BIT_KHR; + } + if (pdevice->vk.supported_extensions.NV_mesh_shader) { + scalar_stages |= VK_SHADER_STAGE_TASK_BIT_NV | + VK_SHADER_STAGE_MESH_BIT_NV; + } + p->subgroupSupportedStages = scalar_stages; + p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT | + VK_SUBGROUP_FEATURE_QUAD_BIT; + if (pdevice->info.ver >= 8) { + /* TODO: There's no technical reason why these can't be made to + * work on gfx7 but they don't at the moment so it's best to leave + * the feature disabled than enabled and broken. + */ + p->subgroupSupportedOperations |= VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | + VK_SUBGROUP_FEATURE_CLUSTERED_BIT; + } + p->subgroupQuadOperationsInAllStages = pdevice->info.ver >= 8; + + p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY; + p->maxMultiviewViewCount = 16; + p->maxMultiviewInstanceIndex = UINT32_MAX / 16; + p->protectedNoFault = false; + /* This value doesn't matter for us today as our per-stage descriptors are + * the real limit. + */ + p->maxPerSetDescriptors = 1024; + p->maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE; +} + +static void +anv_get_physical_device_properties_1_2(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan12Properties *p) +{ + assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES); + + p->driverID = VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA; + memset(p->driverName, 0, sizeof(p->driverName)); + snprintf(p->driverName, VK_MAX_DRIVER_NAME_SIZE, + "Intel open-source Mesa driver"); + memset(p->driverInfo, 0, sizeof(p->driverInfo)); + snprintf(p->driverInfo, VK_MAX_DRIVER_INFO_SIZE, + "Mesa " PACKAGE_VERSION MESA_GIT_SHA1); + + /* Don't advertise conformance with a particular version if the hardware's + * support is incomplete/alpha. 
+ */ + if (pdevice->is_alpha) { + p->conformanceVersion = (VkConformanceVersion) { + .major = 0, + .minor = 0, + .subminor = 0, + .patch = 0, + }; + } + else { + p->conformanceVersion = (VkConformanceVersion) { + .major = 1, + .minor = 3, + .subminor = 0, + .patch = 0, + }; + } + + p->denormBehaviorIndependence = + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL; + p->roundingModeIndependence = + VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE; + + /* Broadwell does not support HF denorms and there are restrictions in + * other gens. According to Kabylake's PRM: + * + * "math - Extended Math Function + * [...] + * Restriction : Half-float denorms are always retained." + */ + p->shaderDenormFlushToZeroFloat16 = false; + p->shaderDenormPreserveFloat16 = pdevice->info.ver > 8; + p->shaderRoundingModeRTEFloat16 = true; + p->shaderRoundingModeRTZFloat16 = true; + p->shaderSignedZeroInfNanPreserveFloat16 = true; + + p->shaderDenormFlushToZeroFloat32 = true; + p->shaderDenormPreserveFloat32 = true; + p->shaderRoundingModeRTEFloat32 = true; + p->shaderRoundingModeRTZFloat32 = true; + p->shaderSignedZeroInfNanPreserveFloat32 = true; + + p->shaderDenormFlushToZeroFloat64 = true; + p->shaderDenormPreserveFloat64 = true; + p->shaderRoundingModeRTEFloat64 = true; + p->shaderRoundingModeRTZFloat64 = true; + p->shaderSignedZeroInfNanPreserveFloat64 = true; + + /* It's a bit hard to exactly map our implementation to the limits + * described by Vulkan. The bindless surface handle in the extended + * message descriptors is 20 bits and it's an index into the table of + * RENDER_SURFACE_STATE structs that starts at bindless surface base + * address. This means that we can have at most 1M surface states + * allocated at any given time. Since most image views take two + * descriptors, this means we have a limit of about 500K image views. + * + * However, since we allocate surface states at vkCreateImageView time, + * this means our limit is actually something on the order of 500K image + * views allocated at any time. The actual limit described by Vulkan, on + * the other hand, is a limit of how many you can have in a descriptor set. + * Assuming anyone using 1M descriptors will be using the same image view + * twice a bunch of times (or a bunch of null descriptors), we can safely + * advertise a larger limit here.
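 * (Illustration, not part of the patch: a 20-bit handle indexes 2^20 = 1,048,576 RENDER_SURFACE_STATE entries, which is where the 1 << 20 max_bindless_views just below comes from; at two descriptors per image view that is ~524K views, the ~500K figure above.)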
+ */ + const unsigned max_bindless_views = 1 << 20; + p->maxUpdateAfterBindDescriptorsInAllPools = max_bindless_views; + p->shaderUniformBufferArrayNonUniformIndexingNative = false; + p->shaderSampledImageArrayNonUniformIndexingNative = false; + p->shaderStorageBufferArrayNonUniformIndexingNative = true; + p->shaderStorageImageArrayNonUniformIndexingNative = false; + p->shaderInputAttachmentArrayNonUniformIndexingNative = false; + p->robustBufferAccessUpdateAfterBind = true; + p->quadDivergentImplicitLod = false; + p->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views; + p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS; + p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX; + p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views; + p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views; + p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS; + p->maxPerStageUpdateAfterBindResources = UINT32_MAX; + p->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views; + p->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS; + p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2; + p->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX; + p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2; + p->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views; + p->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views; + p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS; + + /* We support all of the depth resolve modes */ + p->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT | + VK_RESOLVE_MODE_AVERAGE_BIT | + VK_RESOLVE_MODE_MIN_BIT | + VK_RESOLVE_MODE_MAX_BIT; + /* Average doesn't make sense for stencil so we don't support that */ + p->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT; + if (pdevice->info.ver >= 8) { + /* The advanced stencil resolve modes currently require stencil + * sampling be supported by the hardware. 
+ */ + p->supportedStencilResolveModes |= VK_RESOLVE_MODE_MIN_BIT | + VK_RESOLVE_MODE_MAX_BIT; + } + p->independentResolveNone = true; + p->independentResolve = true; + + p->filterMinmaxSingleComponentFormats = pdevice->info.ver >= 9; + p->filterMinmaxImageComponentMapping = pdevice->info.ver >= 9; + + p->maxTimelineSemaphoreValueDifference = UINT64_MAX; + + p->framebufferIntegerColorSampleCounts = + isl_device_get_sample_counts(&pdevice->isl_dev); +} + +static void +anv_get_physical_device_properties_1_3(struct anv_physical_device *pdevice, + VkPhysicalDeviceVulkan13Properties *p) +{ + assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_PROPERTIES); + + p->minSubgroupSize = 8; + p->maxSubgroupSize = 32; + p->maxComputeWorkgroupSubgroups = pdevice->info.max_cs_workgroup_threads; + p->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT | + VK_SHADER_STAGE_TASK_BIT_NV | + VK_SHADER_STAGE_MESH_BIT_NV; + + p->maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE; + p->maxPerStageDescriptorInlineUniformBlocks = + MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS; + p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = + MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS; + p->maxDescriptorSetInlineUniformBlocks = + MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS; + p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = + MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS; + p->maxInlineUniformTotalSize = UINT16_MAX; + + p->integerDotProduct8BitUnsignedAccelerated = false; + p->integerDotProduct8BitSignedAccelerated = false; + p->integerDotProduct8BitMixedSignednessAccelerated = false; + p->integerDotProduct4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12; + p->integerDotProduct4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12; + p->integerDotProduct4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12; + p->integerDotProduct16BitUnsignedAccelerated = false; + p->integerDotProduct16BitSignedAccelerated = false; + p->integerDotProduct16BitMixedSignednessAccelerated = false; + p->integerDotProduct32BitUnsignedAccelerated = false; + p->integerDotProduct32BitSignedAccelerated = false; + p->integerDotProduct32BitMixedSignednessAccelerated = false; + p->integerDotProduct64BitUnsignedAccelerated = false; + p->integerDotProduct64BitSignedAccelerated = false; + p->integerDotProduct64BitMixedSignednessAccelerated = false; + p->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false; + p->integerDotProductAccumulatingSaturating8BitSignedAccelerated = false; + p->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false; + p->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12; + p->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12; + p->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12; + p->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false; + p->integerDotProductAccumulatingSaturating16BitSignedAccelerated = false; + p->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false; + p->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false; + p->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false; + p->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false; + p->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false; + p->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false; + 
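/* Illustrative sketch, not part of this patch: how an application reads the
 * core 1.2 properties filled in by anv_get_physical_device_properties_1_2
 * above, by chaining VkPhysicalDeviceVulkan12Properties into the pNext list
 * consumed by anv_GetPhysicalDeviceProperties2 further down.
 * `physical_device` is assumed to be a valid VkPhysicalDevice.
 */
#include <stdio.h>
#include <vulkan/vulkan.h>

static void
print_driver_and_conformance(VkPhysicalDevice physical_device)
{
   VkPhysicalDeviceVulkan12Properties vk12 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES,
   };
   VkPhysicalDeviceProperties2 props2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
      .pNext = &vk12,
   };

   vkGetPhysicalDeviceProperties2(physical_device, &props2);

   printf("driver: %s (%s), conformance %d.%d.%d.%d\n",
          vk12.driverName, vk12.driverInfo,
          vk12.conformanceVersion.major, vk12.conformanceVersion.minor,
          vk12.conformanceVersion.subminor, vk12.conformanceVersion.patch);

   /* limits.timestampPeriod is nanoseconds per timestamp tick, i.e.
    * 1e9 / timestamp_frequency as set up in anv_GetPhysicalDeviceProperties.
    */
   printf("timestampPeriod: %f ns/tick\n",
          props2.properties.limits.timestampPeriod);
}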
p->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false; + + /* From the SKL PRM Vol. 2d, docs for RENDER_SURFACE_STATE::Surface + * Base Address: + * + * "For SURFTYPE_BUFFER non-rendertarget surfaces, this field + * specifies the base address of the first element of the surface, + * computed in software by adding the surface base address to the + * byte offset of the element in the buffer. The base address must + * be aligned to element size." + * + * The typed dataport messages require that things be texel aligned. + * Otherwise, we may just load/store the wrong data or, in the worst + * case, there may be hangs. + */ + p->storageTexelBufferOffsetAlignmentBytes = 16; + p->storageTexelBufferOffsetSingleTexelAlignment = true; + + /* The sampler, however, is much more forgiving and it can handle + * arbitrary byte alignment for linear and buffer surfaces. It's + * hard to find a good PRM citation for this but years of empirical + * experience demonstrate that this is true. + */ + p->uniformTexelBufferOffsetAlignmentBytes = 1; + p->uniformTexelBufferOffsetSingleTexelAlignment = false; + + p->maxBufferSize = pdevice->isl_dev.max_buffer_size; +} + +void anv_GetPhysicalDeviceProperties2( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceProperties2* pProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + + anv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties); + + VkPhysicalDeviceVulkan11Properties core_1_1 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES, + }; + anv_get_physical_device_properties_1_1(pdevice, &core_1_1); + + VkPhysicalDeviceVulkan12Properties core_1_2 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES, + }; + anv_get_physical_device_properties_1_2(pdevice, &core_1_2); + + VkPhysicalDeviceVulkan13Properties core_1_3 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_PROPERTIES, + }; + anv_get_physical_device_properties_1_3(pdevice, &core_1_3); + + vk_foreach_struct(ext, pProperties->pNext) { + if (vk_get_physical_device_core_1_1_property_ext(ext, &core_1_1)) + continue; + if (vk_get_physical_device_core_1_2_property_ext(ext, &core_1_2)) + continue; + if (vk_get_physical_device_core_1_3_property_ext(ext, &core_1_3)) + continue; + + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_PROPERTIES_KHR: { + VkPhysicalDeviceAccelerationStructurePropertiesKHR *props = (void *)ext; + props->maxGeometryCount = (1u << 24) - 1; + props->maxInstanceCount = (1u << 24) - 1; + props->maxPrimitiveCount = (1u << 29) - 1; + props->maxPerStageDescriptorAccelerationStructures = UINT16_MAX; + props->maxPerStageDescriptorUpdateAfterBindAccelerationStructures = UINT16_MAX; + props->maxDescriptorSetAccelerationStructures = UINT16_MAX; + props->maxDescriptorSetUpdateAfterBindAccelerationStructures = UINT16_MAX; + props->minAccelerationStructureScratchOffsetAlignment = 64; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONSERVATIVE_RASTERIZATION_PROPERTIES_EXT: { + /* TODO: Real limits */ + VkPhysicalDeviceConservativeRasterizationPropertiesEXT *properties = + (VkPhysicalDeviceConservativeRasterizationPropertiesEXT *)ext; + /* There's nothing in the public docs about this value as far as I + * can tell. 
However, this is the value the Windows driver reports + * and there's a comment on a rejected HW feature in the internal + * docs that says: + * + * "This is similar to conservative rasterization, except the + * primitive area is not extended by 1/512 and..." + * + * That's a bit of an obtuse reference but it's the best we've got + * for now. + */ + properties->primitiveOverestimationSize = 1.0f / 512.0f; + properties->maxExtraPrimitiveOverestimationSize = 0.0f; + properties->extraPrimitiveOverestimationSizeGranularity = 0.0f; + properties->primitiveUnderestimation = false; + properties->conservativePointAndLineRasterization = false; + properties->degenerateTrianglesRasterized = true; + properties->degenerateLinesRasterized = false; + properties->fullyCoveredFragmentShaderInputVariable = false; + properties->conservativeRasterizationPostDepthCoverage = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_PROPERTIES_EXT: { + VkPhysicalDeviceCustomBorderColorPropertiesEXT *properties = + (VkPhysicalDeviceCustomBorderColorPropertiesEXT *)ext; + properties->maxCustomBorderColorSamplers = MAX_CUSTOM_BORDER_COLORS; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_PROPERTIES_KHR: { + VkPhysicalDeviceFragmentShadingRatePropertiesKHR *props = + (VkPhysicalDeviceFragmentShadingRatePropertiesKHR *)ext; + props->primitiveFragmentShadingRateWithMultipleViewports = + pdevice->info.has_coarse_pixel_primitive_and_cb; + props->layeredShadingRateAttachments = pdevice->info.has_coarse_pixel_primitive_and_cb; + props->fragmentShadingRateNonTrivialCombinerOps = + pdevice->info.has_coarse_pixel_primitive_and_cb; + props->maxFragmentSize = (VkExtent2D) { 4, 4 }; + props->maxFragmentSizeAspectRatio = + pdevice->info.has_coarse_pixel_primitive_and_cb ? + 2 : 4; + props->maxFragmentShadingRateCoverageSamples = 4 * 4 * + (pdevice->info.has_coarse_pixel_primitive_and_cb ? 4 : 16); + props->maxFragmentShadingRateRasterizationSamples = + pdevice->info.has_coarse_pixel_primitive_and_cb ? + VK_SAMPLE_COUNT_4_BIT : VK_SAMPLE_COUNT_16_BIT; + props->fragmentShadingRateWithShaderDepthStencilWrites = false; + props->fragmentShadingRateWithSampleMask = true; + props->fragmentShadingRateWithShaderSampleMask = false; + props->fragmentShadingRateWithConservativeRasterization = true; + props->fragmentShadingRateWithFragmentShaderInterlock = true; + props->fragmentShadingRateWithCustomSampleLocations = true; + + /* Fix in DG2_G10_C0 and DG2_G11_B0. Consider any other Sku as having + * the fix. + */ + props->fragmentShadingRateStrictMultiplyCombiner = + pdevice->info.platform == INTEL_PLATFORM_DG2_G10 ? + pdevice->info.revision >= 8 : + pdevice->info.platform == INTEL_PLATFORM_DG2_G11 ? + pdevice->info.revision >= 4 : true; + + if (pdevice->info.has_coarse_pixel_primitive_and_cb) { + props->minFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 8, 8 }; + props->maxFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 8, 8 }; + props->maxFragmentShadingRateAttachmentTexelSizeAspectRatio = 1; + } else { + /* Those must be 0 if attachmentFragmentShadingRate is not + * supported. 
+ */ + props->minFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 }; + props->maxFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 }; + props->maxFragmentShadingRateAttachmentTexelSizeAspectRatio = 0; + } + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: { + VkPhysicalDeviceDrmPropertiesEXT *props = + (VkPhysicalDeviceDrmPropertiesEXT *)ext; + + props->hasPrimary = pdevice->has_master; + props->primaryMajor = pdevice->master_major; + props->primaryMinor = pdevice->master_minor; + + props->hasRender = pdevice->has_local; + props->renderMajor = pdevice->local_major; + props->renderMinor = pdevice->local_minor; + + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT: { + VkPhysicalDeviceExternalMemoryHostPropertiesEXT *props = + (VkPhysicalDeviceExternalMemoryHostPropertiesEXT *) ext; + /* Userptr needs page aligned memory. */ + props->minImportedHostPointerAlignment = 4096; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_PROPERTIES_EXT: { + VkPhysicalDeviceLineRasterizationPropertiesEXT *props = + (VkPhysicalDeviceLineRasterizationPropertiesEXT *)ext; + /* In the Skylake PRM Vol. 7, subsection titled "GIQ (Diamond) + * Sampling Rules - Legacy Mode", it says the following: + * + * "Note that the device divides a pixel into a 16x16 array of + * subpixels, referenced by their upper left corners." + * + * This is the only known reference in the PRMs to the subpixel + * precision of line rasterization and a "16x16 array of subpixels" + * implies 4 subpixel precision bits. Empirical testing has shown + * that 4 subpixel precision bits applies to all line rasterization + * types. + */ + props->lineSubPixelPrecisionBits = 4; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_PROPERTIES: { + VkPhysicalDeviceMaintenance4Properties *properties = + (VkPhysicalDeviceMaintenance4Properties *)ext; + properties->maxBufferSize = pdevice->isl_dev.max_buffer_size; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_PROPERTIES_NV: { + VkPhysicalDeviceMeshShaderPropertiesNV *props = + (VkPhysicalDeviceMeshShaderPropertiesNV *)ext; + + /* Bounded by the maximum representable size in + * 3DSTATE_MESH_SHADER_BODY::SharedLocalMemorySize. Same for Task. + */ + const uint32_t max_slm_size = 64 * 1024; + + /* Bounded by the maximum representable size in + * 3DSTATE_MESH_SHADER_BODY::LocalXMaximum. Same for Task. + */ + const uint32_t max_workgroup_size = 1 << 10; + + /* Bounded by the maximum representable count in + * 3DSTATE_MESH_SHADER_BODY::MaximumPrimitiveCount. + */ + const uint32_t max_primitives = 1024; + + /* TODO(mesh): Multiview. */ + const uint32_t max_view_count = 1; + + props->maxDrawMeshTasksCount = UINT32_MAX; + + /* TODO(mesh): Implement workgroup Y and Z sizes larger than one by + * mapping them to/from the single value that HW provides us + * (currently used for X). 
+ */ + + props->maxTaskWorkGroupInvocations = max_workgroup_size; + props->maxTaskWorkGroupSize[0] = max_workgroup_size; + props->maxTaskWorkGroupSize[1] = 1; + props->maxTaskWorkGroupSize[2] = 1; + props->maxTaskTotalMemorySize = max_slm_size; + props->maxTaskOutputCount = UINT16_MAX; + + props->maxMeshWorkGroupInvocations = max_workgroup_size; + props->maxMeshWorkGroupSize[0] = max_workgroup_size; + props->maxMeshWorkGroupSize[1] = 1; + props->maxMeshWorkGroupSize[2] = 1; + props->maxMeshTotalMemorySize = max_slm_size / max_view_count; + props->maxMeshOutputPrimitives = max_primitives / max_view_count; + props->maxMeshMultiviewViewCount = max_view_count; + + /* Depends on what indices can be represented with IndexFormat. For + * now we always use U32, so bound to the maximum unique vertices we + * need for the maximum primitives. + * + * TODO(mesh): Revisit this if we drop "U32" IndexFormat when adding + * support for others. + */ + props->maxMeshOutputVertices = 3 * props->maxMeshOutputPrimitives; + + + props->meshOutputPerVertexGranularity = 32; + props->meshOutputPerPrimitiveGranularity = 32; + + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT: { + VkPhysicalDevicePCIBusInfoPropertiesEXT *properties = + (VkPhysicalDevicePCIBusInfoPropertiesEXT *)ext; + properties->pciDomain = pdevice->info.pci_domain; + properties->pciBus = pdevice->info.pci_bus; + properties->pciDevice = pdevice->info.pci_dev; + properties->pciFunction = pdevice->info.pci_func; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR: { + VkPhysicalDevicePerformanceQueryPropertiesKHR *properties = + (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext; + /* We could support this by spawning a shader to do the equation + * normalization. 
+ */ + properties->allowCommandBufferQueryCopies = false; + break; + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENTATION_PROPERTIES_ANDROID: { + VkPhysicalDevicePresentationPropertiesANDROID *props = + (VkPhysicalDevicePresentationPropertiesANDROID *)ext; + props->sharedImage = VK_FALSE; + break; + } +#pragma GCC diagnostic pop + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: { + VkPhysicalDeviceProvokingVertexPropertiesEXT *properties = + (VkPhysicalDeviceProvokingVertexPropertiesEXT *)ext; + properties->provokingVertexModePerPipeline = true; + properties->transformFeedbackPreservesTriangleFanProvokingVertex = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR: { + VkPhysicalDevicePushDescriptorPropertiesKHR *properties = + (VkPhysicalDevicePushDescriptorPropertiesKHR *) ext; + properties->maxPushDescriptors = MAX_PUSH_DESCRIPTORS; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_PROPERTIES_EXT: { + VkPhysicalDeviceRobustness2PropertiesEXT *properties = (void *)ext; + properties->robustStorageBufferAccessSizeAlignment = + ANV_SSBO_BOUNDS_CHECK_ALIGNMENT; + properties->robustUniformBufferAccessSizeAlignment = + ANV_UBO_ALIGNMENT; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLE_LOCATIONS_PROPERTIES_EXT: { + VkPhysicalDeviceSampleLocationsPropertiesEXT *props = + (VkPhysicalDeviceSampleLocationsPropertiesEXT *)ext; + + props->sampleLocationSampleCounts = + isl_device_get_sample_counts(&pdevice->isl_dev); + + /* See also anv_GetPhysicalDeviceMultisamplePropertiesEXT */ + props->maxSampleLocationGridSize.width = 1; + props->maxSampleLocationGridSize.height = 1; + + props->sampleLocationCoordinateRange[0] = 0; + props->sampleLocationCoordinateRange[1] = 0.9375; + props->sampleLocationSubPixelBits = 4; + + props->variableSampleLocations = true; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_MODULE_IDENTIFIER_PROPERTIES_EXT: { + VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *props = + (VkPhysicalDeviceShaderModuleIdentifierPropertiesEXT *)ext; + STATIC_ASSERT(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) == + sizeof(props->shaderModuleIdentifierAlgorithmUUID)); + memcpy(props->shaderModuleIdentifierAlgorithmUUID, + vk_shaderModuleIdentifierAlgorithmUUID, + sizeof(props->shaderModuleIdentifierAlgorithmUUID)); + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: { + VkPhysicalDeviceTransformFeedbackPropertiesEXT *props = + (VkPhysicalDeviceTransformFeedbackPropertiesEXT *)ext; + + props->maxTransformFeedbackStreams = MAX_XFB_STREAMS; + props->maxTransformFeedbackBuffers = MAX_XFB_BUFFERS; + props->maxTransformFeedbackBufferSize = (1ull << 32); + props->maxTransformFeedbackStreamDataSize = 128 * 4; + props->maxTransformFeedbackBufferDataSize = 128 * 4; + props->maxTransformFeedbackBufferDataStride = 2048; + props->transformFeedbackQueries = true; + props->transformFeedbackStreamsLinesTriangles = false; + props->transformFeedbackRasterizationStreamSelect = false; + /* This requires MI_MATH */ + props->transformFeedbackDraw = pdevice->info.verx10 >= 75; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: { + VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *props = + (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext; + /* We have to restrict this a bit for multiview */ + props->maxVertexAttribDivisor = 
UINT32_MAX / 16; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_PROPERTIES_EXT: { + VkPhysicalDeviceMultiDrawPropertiesEXT *props = (VkPhysicalDeviceMultiDrawPropertiesEXT *)ext; + props->maxMultiDrawCount = 2048; + break; + } + + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } +} + +static int +vk_priority_to_gen(int priority) +{ + switch (priority) { + case VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR: + return INTEL_CONTEXT_LOW_PRIORITY; + case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR: + return INTEL_CONTEXT_MEDIUM_PRIORITY; + case VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR: + return INTEL_CONTEXT_HIGH_PRIORITY; + case VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR: + return INTEL_CONTEXT_REALTIME_PRIORITY; + default: + unreachable("Invalid priority"); + } +} + +static const VkQueueFamilyProperties +anv_queue_family_properties_template = { + .timestampValidBits = 36, /* XXX: Real value here */ + .minImageTransferGranularity = { 1, 1, 1 }, +}; + +void anv_GetPhysicalDeviceQueueFamilyProperties2( + VkPhysicalDevice physicalDevice, + uint32_t* pQueueFamilyPropertyCount, + VkQueueFamilyProperties2* pQueueFamilyProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out, + pQueueFamilyProperties, pQueueFamilyPropertyCount); + + for (uint32_t i = 0; i < pdevice->queue.family_count; i++) { + struct anv_queue_family *queue_family = &pdevice->queue.families[i]; + vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) { + p->queueFamilyProperties = anv_queue_family_properties_template; + p->queueFamilyProperties.queueFlags = queue_family->queueFlags; + p->queueFamilyProperties.queueCount = queue_family->queueCount; + + vk_foreach_struct(ext, p->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_KHR: { + VkQueueFamilyGlobalPriorityPropertiesKHR *properties = + (VkQueueFamilyGlobalPriorityPropertiesKHR *)ext; + + /* Deliberately sorted low to high */ + VkQueueGlobalPriorityKHR all_priorities[] = { + VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR, + VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR, + VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR, + VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR, + }; + + uint32_t count = 0; + for (unsigned i = 0; i < ARRAY_SIZE(all_priorities); i++) { + if (vk_priority_to_gen(all_priorities[i]) > + pdevice->max_context_priority) + break; + + properties->priorities[count++] = all_priorities[i]; + } + properties->priorityCount = count; + break; + } + + default: + anv_debug_ignored_stype(ext->sType); + } + } + } + } +} + +void anv_GetPhysicalDeviceMemoryProperties( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties* pMemoryProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + + pMemoryProperties->memoryTypeCount = physical_device->memory.type_count; + for (uint32_t i = 0; i < physical_device->memory.type_count; i++) { + pMemoryProperties->memoryTypes[i] = (VkMemoryType) { + .propertyFlags = physical_device->memory.types[i].propertyFlags, + .heapIndex = physical_device->memory.types[i].heapIndex, + }; + } + + pMemoryProperties->memoryHeapCount = physical_device->memory.heap_count; + for (uint32_t i = 0; i < physical_device->memory.heap_count; i++) { + pMemoryProperties->memoryHeaps[i] = (VkMemoryHeap) { + .size = physical_device->memory.heaps[i].size, + .flags = physical_device->memory.heaps[i].flags, + }; + } +} + +static void +anv_get_memory_budget(VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryBudgetPropertiesEXT 
*memoryBudget) +{ + ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice); + + if (!device->vk.supported_extensions.EXT_memory_budget) + return; + + anv_update_meminfo(device, device->local_fd); + + VkDeviceSize total_sys_heaps_size = 0, total_vram_heaps_size = 0; + for (size_t i = 0; i < device->memory.heap_count; i++) { + if (device->memory.heaps[i].is_local_mem) { + total_vram_heaps_size += device->memory.heaps[i].size; + } else { + total_sys_heaps_size += device->memory.heaps[i].size; + } + } + + for (size_t i = 0; i < device->memory.heap_count; i++) { + VkDeviceSize heap_size = device->memory.heaps[i].size; + VkDeviceSize heap_used = device->memory.heaps[i].used; + VkDeviceSize heap_budget, total_heaps_size; + uint64_t mem_available = 0; + + if (device->memory.heaps[i].is_local_mem) { + total_heaps_size = total_vram_heaps_size; + if (device->vram_non_mappable.size > 0 && i == 0) { + mem_available = device->vram_non_mappable.available; + } else { + mem_available = device->vram_mappable.available; + } + } else { + total_heaps_size = total_sys_heaps_size; + mem_available = device->sys.available; + } + + double heap_proportion = (double) heap_size / total_heaps_size; + VkDeviceSize available_prop = mem_available * heap_proportion; + + /* + * Let's not incite the app to starve the system: report at most 90% of + * the available heap memory. + */ + uint64_t heap_available = available_prop * 9 / 10; + heap_budget = MIN2(heap_size, heap_used + heap_available); + + /* + * Round down to the nearest MB + */ + heap_budget &= ~((1ull << 20) - 1); + + /* + * The heapBudget value must be non-zero for array elements less than + * VkPhysicalDeviceMemoryProperties::memoryHeapCount. The heapBudget + * value must be less than or equal to VkMemoryHeap::size for each heap. 
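A minimal standalone sketch of the heap budget heuristic used above: attribute a share of the still-available system/VRAM memory to the heap, hand at most 90% of that to the application, clamp to the heap size, and round down to a whole MiB. The helper and the numbers in main() are illustrative assumptions, not values from this patch.

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_MIN2(a, b) ((a) < (b) ? (a) : (b))

/* Illustrative only: compute a heap budget from its size, its current usage
 * and the memory still available to the heaps it shares a region with.
 */
static uint64_t
example_heap_budget(uint64_t heap_size, uint64_t heap_used,
                    uint64_t mem_available, double heap_proportion)
{
   /* Report at most 90% of the available memory so the app does not starve
    * the rest of the system.
    */
   uint64_t heap_available =
      (uint64_t)(mem_available * heap_proportion) * 9 / 10;
   uint64_t heap_budget = EXAMPLE_MIN2(heap_size, heap_used + heap_available);

   /* Round down to the nearest MiB, as the driver does above. */
   heap_budget &= ~((1ull << 20) - 1);
   return heap_budget;
}

int main(void)
{
   /* e.g. an 8 GiB heap, 1 GiB already used, 5 GiB still available. */
   printf("budget = %llu MiB\n",
          (unsigned long long)(example_heap_budget(8ull << 30, 1ull << 30,
                                                   5ull << 30, 1.0) >> 20));
   return 0;
}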
+ */ + assert(0 < heap_budget && heap_budget <= heap_size); + + memoryBudget->heapUsage[i] = heap_used; + memoryBudget->heapBudget[i] = heap_budget; + } + + /* The heapBudget and heapUsage values must be zero for array elements + * greater than or equal to VkPhysicalDeviceMemoryProperties::memoryHeapCount + */ + for (uint32_t i = device->memory.heap_count; i < VK_MAX_MEMORY_HEAPS; i++) { + memoryBudget->heapBudget[i] = 0; + memoryBudget->heapUsage[i] = 0; + } +} + +void anv_GetPhysicalDeviceMemoryProperties2( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties2* pMemoryProperties) +{ + anv_GetPhysicalDeviceMemoryProperties(physicalDevice, + &pMemoryProperties->memoryProperties); + + vk_foreach_struct(ext, pMemoryProperties->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT: + anv_get_memory_budget(physicalDevice, (void*)ext); + break; + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } +} + +void +anv_GetDeviceGroupPeerMemoryFeatures( + VkDevice device, + uint32_t heapIndex, + uint32_t localDeviceIndex, + uint32_t remoteDeviceIndex, + VkPeerMemoryFeatureFlags* pPeerMemoryFeatures) +{ + assert(localDeviceIndex == 0 && remoteDeviceIndex == 0); + *pPeerMemoryFeatures = VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT | + VK_PEER_MEMORY_FEATURE_COPY_DST_BIT | + VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT | + VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT; +} + +PFN_vkVoidFunction anv_GetInstanceProcAddr( + VkInstance _instance, + const char* pName) +{ + ANV_FROM_HANDLE(anv_instance, instance, _instance); + return vk_instance_get_proc_addr(&instance->vk, + &anv_instance_entrypoints, + pName); +} + +/* With version 1+ of the loader interface the ICD should expose + * vk_icdGetInstanceProcAddr to work around certain LD_PRELOAD issues seen in apps. 
+ */ +PUBLIC +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr( + VkInstance instance, + const char* pName); + +PUBLIC +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr( + VkInstance instance, + const char* pName) +{ + return anv_GetInstanceProcAddr(instance, pName); +} + +/* With version 4+ of the loader interface the ICD should expose + * vk_icdGetPhysicalDeviceProcAddr() + */ +PUBLIC +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetPhysicalDeviceProcAddr( + VkInstance _instance, + const char* pName); + +PFN_vkVoidFunction vk_icdGetPhysicalDeviceProcAddr( + VkInstance _instance, + const char* pName) +{ + ANV_FROM_HANDLE(anv_instance, instance, _instance); + return vk_instance_get_physical_device_proc_addr(&instance->vk, pName); +} + +static struct anv_state +anv_state_pool_emit_data(struct anv_state_pool *pool, size_t size, size_t align, const void *p) +{ + struct anv_state state; + + state = anv_state_pool_alloc(pool, size, align); + memcpy(state.map, p, size); + + return state; +} + +static void +anv_device_init_border_colors(struct anv_device *device) +{ + if (device->info->platform == INTEL_PLATFORM_HSW) { + static const struct hsw_border_color border_colors[] = { + [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 0.0 } }, + [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 1.0 } }, + [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = { .float32 = { 1.0, 1.0, 1.0, 1.0 } }, + [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = { .uint32 = { 0, 0, 0, 0 } }, + [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = { .uint32 = { 0, 0, 0, 1 } }, + [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = { .uint32 = { 1, 1, 1, 1 } }, + }; + + device->border_colors = + anv_state_pool_emit_data(&device->dynamic_state_pool, + sizeof(border_colors), 512, border_colors); + } else { + static const struct gfx8_border_color border_colors[] = { + [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 0.0 } }, + [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 1.0 } }, + [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = { .float32 = { 1.0, 1.0, 1.0, 1.0 } }, + [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = { .uint32 = { 0, 0, 0, 0 } }, + [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = { .uint32 = { 0, 0, 0, 1 } }, + [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = { .uint32 = { 1, 1, 1, 1 } }, + }; + + device->border_colors = + anv_state_pool_emit_data(&device->dynamic_state_pool, + sizeof(border_colors), 64, border_colors); + } +} + +static VkResult +anv_device_init_trivial_batch(struct anv_device *device) +{ + VkResult result = anv_device_alloc_bo(device, "trivial-batch", 4096, + ANV_BO_ALLOC_MAPPED, + 0 /* explicit_address */, + &device->trivial_batch_bo); + if (result != VK_SUCCESS) + return result; + + struct anv_batch batch = { + .start = device->trivial_batch_bo->map, + .next = device->trivial_batch_bo->map, + .end = device->trivial_batch_bo->map + 4096, + }; + + anv_batch_emit(&batch, GFX7_MI_BATCH_BUFFER_END, bbe); + anv_batch_emit(&batch, GFX7_MI_NOOP, noop); + + if (device->physical->memory.need_clflush) + intel_clflush_range(batch.start, batch.next - batch.start); + + return VK_SUCCESS; +} + +static bool +get_bo_from_pool(struct intel_batch_decode_bo *ret, + struct anv_block_pool *pool, + uint64_t address) +{ + anv_block_pool_foreach_bo(bo, pool) { + uint64_t bo_address = intel_48b_address(bo->offset); + if (address >= bo_address && address < (bo_address + bo->size)) { + *ret = (struct intel_batch_decode_bo) { + .addr = bo_address, + .size = bo->size, + .map = 
bo->map, + }; + return true; + } + } + return false; +} + +/* Finding a buffer for batch decoding */ +static struct intel_batch_decode_bo +decode_get_bo(void *v_batch, bool ppgtt, uint64_t address) +{ + struct anv_device *device = v_batch; + struct intel_batch_decode_bo ret_bo = {}; + + assert(ppgtt); + + if (get_bo_from_pool(&ret_bo, &device->dynamic_state_pool.block_pool, address)) + return ret_bo; + if (get_bo_from_pool(&ret_bo, &device->instruction_state_pool.block_pool, address)) + return ret_bo; + if (get_bo_from_pool(&ret_bo, &device->binding_table_pool.block_pool, address)) + return ret_bo; + if (get_bo_from_pool(&ret_bo, &device->surface_state_pool.block_pool, address)) + return ret_bo; + + if (!device->cmd_buffer_being_decoded) + return (struct intel_batch_decode_bo) { }; + + struct anv_batch_bo **bo; + + u_vector_foreach(bo, &device->cmd_buffer_being_decoded->seen_bbos) { + /* The decoder zeroes out the top 16 bits, so we need to as well */ + uint64_t bo_address = (*bo)->bo->offset & (~0ull >> 16); + + if (address >= bo_address && address < bo_address + (*bo)->bo->size) { + return (struct intel_batch_decode_bo) { + .addr = bo_address, + .size = (*bo)->bo->size, + .map = (*bo)->bo->map, + }; + } + } + + return (struct intel_batch_decode_bo) { }; +} + +struct intel_aux_map_buffer { + struct intel_buffer base; + struct anv_state state; +}; + +static struct intel_buffer * +intel_aux_map_buffer_alloc(void *driver_ctx, uint32_t size) +{ + struct intel_aux_map_buffer *buf = malloc(sizeof(struct intel_aux_map_buffer)); + if (!buf) + return NULL; + + struct anv_device *device = (struct anv_device*)driver_ctx; + assert(device->physical->supports_48bit_addresses && + device->physical->use_softpin); + + struct anv_state_pool *pool = &device->dynamic_state_pool; + buf->state = anv_state_pool_alloc(pool, size, size); + + buf->base.gpu = pool->block_pool.bo->offset + buf->state.offset; + buf->base.gpu_end = buf->base.gpu + buf->state.alloc_size; + buf->base.map = buf->state.map; + buf->base.driver_bo = &buf->state; + return &buf->base; +} + +static void +intel_aux_map_buffer_free(void *driver_ctx, struct intel_buffer *buffer) +{ + struct intel_aux_map_buffer *buf = (struct intel_aux_map_buffer*)buffer; + struct anv_device *device = (struct anv_device*)driver_ctx; + struct anv_state_pool *pool = &device->dynamic_state_pool; + anv_state_pool_free(pool, buf->state); + free(buf); +} + +static struct intel_mapped_pinned_buffer_alloc aux_map_allocator = { + .alloc = intel_aux_map_buffer_alloc, + .free = intel_aux_map_buffer_free, +}; + +static VkResult anv_device_check_status(struct vk_device *vk_device); + +static VkResult +anv_device_setup_context(struct anv_device *device, + const VkDeviceCreateInfo *pCreateInfo, + const uint32_t num_queues) +{ + struct anv_physical_device *physical_device = device->physical; + VkResult result = VK_SUCCESS; + + if (device->physical->engine_info) { + /* The kernel API supports at most 64 engines */ + assert(num_queues <= 64); + uint16_t engine_classes[64]; + int engine_count = 0; + for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { + const VkDeviceQueueCreateInfo *queueCreateInfo = + &pCreateInfo->pQueueCreateInfos[i]; + + assert(queueCreateInfo->queueFamilyIndex < + physical_device->queue.family_count); + struct anv_queue_family *queue_family = + &physical_device->queue.families[queueCreateInfo->queueFamilyIndex]; + + for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++) + engine_classes[engine_count++] = queue_family->engine_class; + } + 
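As a compact illustration of the flattening done just above: every VkDeviceQueueCreateInfo contributes queueCount copies of its family's engine class, so the kernel context ends up with one engine per requested queue. The structs below are placeholder stand-ins for the driver's real queue-family data, written only to show the shape of the loop.

#include <assert.h>
#include <stdint.h>

/* Placeholder stand-ins for the driver's queue-family table. */
struct example_queue_family { uint16_t engine_class; };
struct example_queue_create { uint32_t family_index; uint32_t queue_count; };

/* Flatten the requested queues into one engine-class entry per queue and
 * return how many engines were gathered.
 */
static uint32_t
example_gather_engines(const struct example_queue_family *families,
                       const struct example_queue_create *infos,
                       uint32_t info_count,
                       uint16_t *engine_classes, uint32_t max_engines)
{
   uint32_t count = 0;
   for (uint32_t i = 0; i < info_count; i++) {
      for (uint32_t j = 0; j < infos[i].queue_count; j++) {
         assert(count < max_engines); /* the kernel API caps this at 64 */
         engine_classes[count++] =
            families[infos[i].family_index].engine_class;
      }
   }
   return count;
}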
device->context_id = + intel_gem_create_context_engines(device->fd, + physical_device->engine_info, + engine_count, engine_classes); + } else { + assert(num_queues == 1); + device->context_id = anv_gem_create_context(device); + } + + if (device->context_id == -1) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + return result; + } + + /* Here we tell the kernel not to attempt to recover our context but + * immediately (on the next batchbuffer submission) report that the + * context is lost, and we will do the recovery ourselves. In the case + * of Vulkan, recovery means throwing VK_ERROR_DEVICE_LOST and letting + * the client clean up the pieces. + */ + anv_gem_set_context_param(device->fd, device->context_id, + I915_CONTEXT_PARAM_RECOVERABLE, false); + + /* Check if client specified queue priority. */ + const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority = + vk_find_struct_const(pCreateInfo->pQueueCreateInfos[0].pNext, + DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR); + + VkQueueGlobalPriorityKHR priority = + queue_priority ? queue_priority->globalPriority : + VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR; + + /* As per spec, the driver implementation may deny requests to acquire + * a priority above the default priority (MEDIUM) if the caller does not + * have sufficient privileges. In this scenario VK_ERROR_NOT_PERMITTED_KHR + * is returned. + */ + if (physical_device->max_context_priority >= INTEL_CONTEXT_MEDIUM_PRIORITY) { + int err = anv_gem_set_context_param(device->fd, device->context_id, + I915_CONTEXT_PARAM_PRIORITY, + vk_priority_to_gen(priority)); + if (err != 0 && priority > VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) { + result = vk_error(device, VK_ERROR_NOT_PERMITTED_KHR); + goto fail_context; + } + } + + return result; + +fail_context: + anv_gem_destroy_context(device, device->context_id); + return result; +} + +VkResult anv_CreateDevice( + VkPhysicalDevice physicalDevice, + const VkDeviceCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDevice* pDevice) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + VkResult result; + struct anv_device *device; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO); + + /* Check enabled features */ + bool robust_buffer_access = false; + if (pCreateInfo->pEnabledFeatures) { + if (pCreateInfo->pEnabledFeatures->robustBufferAccess) + robust_buffer_access = true; + } + + vk_foreach_struct_const(ext, pCreateInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2: { + const VkPhysicalDeviceFeatures2 *features = (const void *)ext; + if (features->features.robustBufferAccess) + robust_buffer_access = true; + break; + } + + default: + /* Don't warn */ + break; + } + } + + /* Check requested queues and fail if we are requested to create any + * queues with flags we don't support. 
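A hedged sketch of the priority policy implemented above: the requested global priority is translated to an i915 context priority, and only a failed attempt to raise the priority above the default (MEDIUM) is surfaced as VK_ERROR_NOT_PERMITTED_KHR; a failed attempt at or below the default is ignored. The enum and helper below are illustrative stand-ins, not the driver's definitions.

#include <stdbool.h>

/* Illustrative priority levels; the driver uses the
 * VK_QUEUE_GLOBAL_PRIORITY_*_KHR and INTEL_CONTEXT_*_PRIORITY values.
 */
enum example_priority {
   EX_PRIO_LOW,
   EX_PRIO_MEDIUM,   /* the default */
   EX_PRIO_HIGH,
   EX_PRIO_REALTIME,
};

/* Returns true when a failed I915_CONTEXT_PARAM_PRIORITY change must be
 * reported to the application: only raising above MEDIUM needs privileges.
 */
static bool
example_priority_error_is_fatal(int set_param_err,
                                enum example_priority requested)
{
   return set_param_err != 0 && requested > EX_PRIO_MEDIUM;
}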
+ */ + assert(pCreateInfo->queueCreateInfoCount > 0); + for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { + if (pCreateInfo->pQueueCreateInfos[i].flags != 0) + return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED); + } + + device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator, + sizeof(*device), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!device) + return vk_error(physical_device, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct vk_device_dispatch_table dispatch_table; + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + anv_genX(&physical_device->info, device_entrypoints), true); + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &anv_device_entrypoints, false); + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &wsi_device_entrypoints, false); + + result = vk_device_init(&device->vk, &physical_device->vk, + &dispatch_table, pCreateInfo, pAllocator); + if (result != VK_SUCCESS) + goto fail_alloc; + + if (INTEL_DEBUG(DEBUG_BATCH)) { + const unsigned decode_flags = + INTEL_BATCH_DECODE_FULL | + (INTEL_DEBUG(DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) | + INTEL_BATCH_DECODE_OFFSETS | + INTEL_BATCH_DECODE_FLOATS; + + intel_batch_decode_ctx_init(&device->decoder_ctx, + &physical_device->compiler->isa, + &physical_device->info, + stderr, decode_flags, NULL, + decode_get_bo, NULL, device); + + device->decoder_ctx.dynamic_base = DYNAMIC_STATE_POOL_MIN_ADDRESS; + device->decoder_ctx.surface_base = SURFACE_STATE_POOL_MIN_ADDRESS; + device->decoder_ctx.instruction_base = + INSTRUCTION_STATE_POOL_MIN_ADDRESS; + } + + anv_device_set_physical(device, physical_device); + + /* XXX(chadv): Can we dup() physicalDevice->fd here? */ + device->fd = open(physical_device->path, O_RDWR | O_CLOEXEC); + if (device->fd == -1) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_device; + } + + device->vk.check_status = anv_device_check_status; + device->vk.create_sync_for_memory = anv_create_sync_for_memory; + vk_device_set_drm_fd(&device->vk, device->fd); + + uint32_t num_queues = 0; + for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) + num_queues += pCreateInfo->pQueueCreateInfos[i].queueCount; + + result = anv_device_setup_context(device, pCreateInfo, num_queues); + if (result != VK_SUCCESS) + goto fail_fd; + + device->queues = + vk_zalloc(&device->vk.alloc, num_queues * sizeof(*device->queues), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (device->queues == NULL) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_context_id; + } + + device->queue_count = 0; + for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { + const VkDeviceQueueCreateInfo *queueCreateInfo = + &pCreateInfo->pQueueCreateInfos[i]; + + for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++) { + /* When using legacy contexts, we use I915_EXEC_RENDER but, with + * engine-based contexts, the bottom 6 bits of exec_flags are used + * for the engine ID. + */ + uint32_t exec_flags = device->physical->engine_info ? 
+ device->queue_count : I915_EXEC_RENDER; + + result = anv_queue_init(device, &device->queues[device->queue_count], + exec_flags, queueCreateInfo, j); + if (result != VK_SUCCESS) + goto fail_queues; + + device->queue_count++; + } + } + + if (!anv_use_relocations(physical_device)) { + if (pthread_mutex_init(&device->vma_mutex, NULL) != 0) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_queues; + } + + /* keep the page with address zero out of the allocator */ + util_vma_heap_init(&device->vma_lo, + LOW_HEAP_MIN_ADDRESS, LOW_HEAP_SIZE); + + util_vma_heap_init(&device->vma_cva, CLIENT_VISIBLE_HEAP_MIN_ADDRESS, + CLIENT_VISIBLE_HEAP_SIZE); + + /* Leave the last 4GiB out of the high vma range, so that no state + * base address + size can overflow 48 bits. For more information see + * the comment about Wa32bitGeneralStateOffset in anv_allocator.c + */ + util_vma_heap_init(&device->vma_hi, HIGH_HEAP_MIN_ADDRESS, + physical_device->gtt_size - (1ull << 32) - + HIGH_HEAP_MIN_ADDRESS); + } + + list_inithead(&device->memory_objects); + + /* On Broadwell and later, we can use batch chaining to more efficiently + * implement growing command buffers. Prior to Haswell, the kernel + * command parser gets in the way and we have to fall back to growing + * the batch. + */ + device->can_chain_batches = device->info->ver >= 8; + + device->robust_buffer_access = robust_buffer_access; + + if (pthread_mutex_init(&device->mutex, NULL) != 0) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_vmas; + } + + pthread_condattr_t condattr; + if (pthread_condattr_init(&condattr) != 0) { + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_mutex; + } + if (pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC) != 0) { + pthread_condattr_destroy(&condattr); + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_mutex; + } + if (pthread_cond_init(&device->queue_submit, &condattr) != 0) { + pthread_condattr_destroy(&condattr); + result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + goto fail_mutex; + } + pthread_condattr_destroy(&condattr); + + result = anv_bo_cache_init(&device->bo_cache, device); + if (result != VK_SUCCESS) + goto fail_queue_cond; + + anv_bo_pool_init(&device->batch_bo_pool, device, "batch"); + + /* Because scratch is also relative to General State Base Address, we leave + * the base address 0 and start the pool memory at an offset. This way we + * get the correct offsets in the anv_states that get allocated from it. + */ + result = anv_state_pool_init(&device->general_state_pool, device, + "general pool", + 0, GENERAL_STATE_POOL_MIN_ADDRESS, 16384); + if (result != VK_SUCCESS) + goto fail_batch_bo_pool; + + result = anv_state_pool_init(&device->dynamic_state_pool, device, + "dynamic pool", + DYNAMIC_STATE_POOL_MIN_ADDRESS, 0, 16384); + if (result != VK_SUCCESS) + goto fail_general_state_pool; + + if (device->info->ver >= 8) { + /* The border color pointer is limited to 24 bits, so we need to make + * sure that any such color used at any point in the program doesn't + * exceed that limit. + * We achieve that by reserving all the custom border colors we support + * right off the bat, so they are close to the base address. 
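A rough sanity check of the reservation strategy described above: reserving every custom border color up front keeps the whole range next to the dynamic state base, so each entry stays reachable through the 24-bit border color pointer. The two sizes below are assumptions chosen for illustration; MAX_CUSTOM_BORDER_COLORS and the gfx8 border-color layout are not spelled out here.

#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* Assumed values, for illustration only. */
   const uint32_t max_custom_border_colors = 4096;
   const uint32_t border_color_size = 64;              /* bytes per entry */
   const uint64_t border_color_ptr_limit = 1ull << 24; /* 24-bit pointer  */

   /* The whole reserved block must fit below the 24-bit pointer limit. */
   assert((uint64_t)max_custom_border_colors * border_color_size <
          border_color_ptr_limit);
   return 0;
}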
+ */ + anv_state_reserved_pool_init(&device->custom_border_colors, + &device->dynamic_state_pool, + MAX_CUSTOM_BORDER_COLORS, + sizeof(struct gfx8_border_color), 64); + } + + result = anv_state_pool_init(&device->instruction_state_pool, device, + "instruction pool", + INSTRUCTION_STATE_POOL_MIN_ADDRESS, 0, 16384); + if (result != VK_SUCCESS) + goto fail_dynamic_state_pool; + + result = anv_state_pool_init(&device->surface_state_pool, device, + "surface state pool", + SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096); + if (result != VK_SUCCESS) + goto fail_instruction_state_pool; + + if (device->info->verx10 >= 125) { + /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to give the binding + * table its own base address separately from surface state base. + */ + result = anv_state_pool_init(&device->binding_table_pool, device, + "binding table pool", + BINDING_TABLE_POOL_MIN_ADDRESS, 0, + BINDING_TABLE_POOL_BLOCK_SIZE); + } else if (!anv_use_relocations(physical_device)) { + int64_t bt_pool_offset = (int64_t)BINDING_TABLE_POOL_MIN_ADDRESS - + (int64_t)SURFACE_STATE_POOL_MIN_ADDRESS; + assert(INT32_MIN < bt_pool_offset && bt_pool_offset < 0); + result = anv_state_pool_init(&device->binding_table_pool, device, + "binding table pool", + SURFACE_STATE_POOL_MIN_ADDRESS, + bt_pool_offset, + BINDING_TABLE_POOL_BLOCK_SIZE); + } + if (result != VK_SUCCESS) + goto fail_surface_state_pool; + + if (device->info->has_aux_map) { + device->aux_map_ctx = intel_aux_map_init(device, &aux_map_allocator, + &physical_device->info); + if (!device->aux_map_ctx) + goto fail_binding_table_pool; + } + + result = anv_device_alloc_bo(device, "workaround", 4096, + ANV_BO_ALLOC_CAPTURE | + ANV_BO_ALLOC_MAPPED, + 0 /* explicit_address */, + &device->workaround_bo); + if (result != VK_SUCCESS) + goto fail_surface_aux_map_pool; + + device->workaround_address = (struct anv_address) { + .bo = device->workaround_bo, + .offset = align_u32( + intel_debug_write_identifiers(device->workaround_bo->map, + device->workaround_bo->size, + "Anv") + 8, 8), + }; + + device->debug_frame_desc = + intel_debug_get_identifier_block(device->workaround_bo->map, + device->workaround_bo->size, + INTEL_DEBUG_BLOCK_TYPE_FRAME); + + if (device->vk.enabled_extensions.KHR_ray_query) { + uint32_t ray_queries_size = + align_u32(brw_rt_ray_queries_hw_stacks_size(device->info), 4096); + + result = anv_device_alloc_bo(device, "ray queries", + ray_queries_size, + 0, + 0 /* explicit_address */, + &device->ray_query_bo); + if (result != VK_SUCCESS) + goto fail_workaround_bo; + } + + result = anv_device_init_trivial_batch(device); + if (result != VK_SUCCESS) + goto fail_ray_query_bo; + + if (device->info->ver >= 12 && + device->vk.enabled_extensions.KHR_fragment_shading_rate) { + uint32_t n_cps_states = 3 * 3; /* All combinaisons of X by Y CP sizes (1, 2, 4) */ + + if (device->info->has_coarse_pixel_primitive_and_cb) + n_cps_states *= 5 * 5; /* 5 combiners by 2 operators */ + + n_cps_states += 1; /* Disable CPS */ + + /* Each of the combinaison must be replicated on all viewports */ + n_cps_states *= MAX_VIEWPORTS; + + device->cps_states = + anv_state_pool_alloc(&device->dynamic_state_pool, + n_cps_states * CPS_STATE_length(device->info) * 4, + 32); + if (device->cps_states.map == NULL) + goto fail_trivial_batch; + + anv_genX(device->info, init_cps_device_state)(device); + } + + /* Allocate a null surface state at surface state offset 0. This makes + * NULL descriptor handling trivial because we can just memset structures + * to zero and they have a valid descriptor. 
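The zero-is-null property relied on here can be shown with a toy descriptor: because the very first surface state allocated from the pool lands at offset 0, a descriptor that has merely been memset to zero already references a valid null surface. The struct below is a hypothetical stand-in for the driver's descriptor layout.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical descriptor: just an offset into the surface state pool. */
struct example_descriptor { uint32_t surface_state_offset; };

int main(void)
{
   const uint32_t null_surface_state_offset = 0; /* first pool allocation */

   struct example_descriptor desc;
   memset(&desc, 0, sizeof(desc));

   /* A zero-filled descriptor already points at the null surface state. */
   assert(desc.surface_state_offset == null_surface_state_offset);
   return 0;
}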
+ */ + device->null_surface_state = + anv_state_pool_alloc(&device->surface_state_pool, + device->isl_dev.ss.size, + device->isl_dev.ss.align); + isl_null_fill_state(&device->isl_dev, device->null_surface_state.map, + .size = isl_extent3d(1, 1, 1) /* This shouldn't matter */); + assert(device->null_surface_state.offset == 0); + + anv_scratch_pool_init(device, &device->scratch_pool); + + /* TODO(RT): Do we want some sort of data structure for this? */ + memset(device->rt_scratch_bos, 0, sizeof(device->rt_scratch_bos)); + + result = anv_genX(device->info, init_device_state)(device); + if (result != VK_SUCCESS) + goto fail_trivial_batch_bo_and_scratch_pool; + + struct vk_pipeline_cache_create_info pcc_info = { }; + device->default_pipeline_cache = + vk_pipeline_cache_create(&device->vk, &pcc_info, NULL); + if (!device->default_pipeline_cache) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_trivial_batch_bo_and_scratch_pool; + } + + /* Internal shaders need their own pipeline cache because, unlike the rest + * of ANV, it won't work at all without the cache. It depends on it for + * shaders to remain resident while it runs. Therefore, we need a special + * cache just for BLORP/RT that's forced to always be enabled. + */ + pcc_info.force_enable = true; + device->internal_cache = + vk_pipeline_cache_create(&device->vk, &pcc_info, NULL); + if (device->internal_cache == NULL) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_default_pipeline_cache; + } + + result = anv_device_init_rt_shaders(device); + if (result != VK_SUCCESS) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_internal_cache; + } + + anv_device_init_blorp(device); + + anv_device_init_border_colors(device); + + anv_device_perf_init(device); + + anv_device_utrace_init(device); + + *pDevice = anv_device_to_handle(device); + + return VK_SUCCESS; + + fail_internal_cache: + vk_pipeline_cache_destroy(device->internal_cache, NULL); + fail_default_pipeline_cache: + vk_pipeline_cache_destroy(device->default_pipeline_cache, NULL); + fail_trivial_batch_bo_and_scratch_pool: + anv_scratch_pool_finish(device, &device->scratch_pool); + fail_trivial_batch: + anv_device_release_bo(device, device->trivial_batch_bo); + fail_ray_query_bo: + if (device->ray_query_bo) + anv_device_release_bo(device, device->ray_query_bo); + fail_workaround_bo: + anv_device_release_bo(device, device->workaround_bo); + fail_surface_aux_map_pool: + if (device->info->has_aux_map) { + intel_aux_map_finish(device->aux_map_ctx); + device->aux_map_ctx = NULL; + } + fail_binding_table_pool: + if (!anv_use_relocations(physical_device)) + anv_state_pool_finish(&device->binding_table_pool); + fail_surface_state_pool: + anv_state_pool_finish(&device->surface_state_pool); + fail_instruction_state_pool: + anv_state_pool_finish(&device->instruction_state_pool); + fail_dynamic_state_pool: + if (device->info->ver >= 8) + anv_state_reserved_pool_finish(&device->custom_border_colors); + anv_state_pool_finish(&device->dynamic_state_pool); + fail_general_state_pool: + anv_state_pool_finish(&device->general_state_pool); + fail_batch_bo_pool: + anv_bo_pool_finish(&device->batch_bo_pool); + anv_bo_cache_finish(&device->bo_cache); + fail_queue_cond: + pthread_cond_destroy(&device->queue_submit); + fail_mutex: + pthread_mutex_destroy(&device->mutex); + fail_vmas: + if (!anv_use_relocations(physical_device)) { + util_vma_heap_finish(&device->vma_hi); + util_vma_heap_finish(&device->vma_cva); + util_vma_heap_finish(&device->vma_lo); 
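The error handling in this function follows the usual goto-ladder idiom: each fail_* label undoes exactly the initialization steps that completed before the jump, in reverse order. A minimal sketch of the same pattern, with made-up resource names:

#include <stdbool.h>

/* Hypothetical resources standing in for pools, BOs, mutexes, ... */
static bool init_a(void) { return true; }
static bool init_b(void) { return true; }
static bool init_c(void) { return false; } /* pretend this one fails */
static void finish_a(void) {}
static void finish_b(void) {}

static int
example_create(void)
{
   if (!init_a())
      return -1;
   if (!init_b())
      goto fail_a;
   if (!init_c())
      goto fail_b;
   return 0;

fail_b:
   finish_b();   /* undo in reverse order of initialization */
fail_a:
   finish_a();
   return -1;
}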
+ } + fail_queues: + for (uint32_t i = 0; i < device->queue_count; i++) + anv_queue_finish(&device->queues[i]); + vk_free(&device->vk.alloc, device->queues); + fail_context_id: + anv_gem_destroy_context(device, device->context_id); + fail_fd: + close(device->fd); + fail_device: + vk_device_finish(&device->vk); + fail_alloc: + vk_free(&device->vk.alloc, device); + + return result; +} + +void anv_DestroyDevice( + VkDevice _device, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!device) + return; + + anv_device_utrace_finish(device); + + anv_device_finish_blorp(device); + + anv_device_finish_rt_shaders(device); + + vk_pipeline_cache_destroy(device->internal_cache, NULL); + vk_pipeline_cache_destroy(device->default_pipeline_cache, NULL); + +#ifdef HAVE_VALGRIND + /* We only need to free these to prevent valgrind errors. The backing + * BO will go away in a couple of lines so we don't actually leak. + */ + if (device->info->ver >= 8) + anv_state_reserved_pool_finish(&device->custom_border_colors); + anv_state_pool_free(&device->dynamic_state_pool, device->border_colors); + anv_state_pool_free(&device->dynamic_state_pool, device->slice_hash); + anv_state_pool_free(&device->dynamic_state_pool, device->cps_states); +#endif + + for (unsigned i = 0; i < ARRAY_SIZE(device->rt_scratch_bos); i++) { + if (device->rt_scratch_bos[i] != NULL) + anv_device_release_bo(device, device->rt_scratch_bos[i]); + } + + anv_scratch_pool_finish(device, &device->scratch_pool); + + if (device->vk.enabled_extensions.KHR_ray_query) { + for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_shadow_bos); i++) { + if (device->ray_query_shadow_bos[i] != NULL) + anv_device_release_bo(device, device->ray_query_shadow_bos[i]); + } + anv_device_release_bo(device, device->ray_query_bo); + } + anv_device_release_bo(device, device->workaround_bo); + anv_device_release_bo(device, device->trivial_batch_bo); + + if (device->info->has_aux_map) { + intel_aux_map_finish(device->aux_map_ctx); + device->aux_map_ctx = NULL; + } + + if (!anv_use_relocations(device->physical)) + anv_state_pool_finish(&device->binding_table_pool); + anv_state_pool_finish(&device->surface_state_pool); + anv_state_pool_finish(&device->instruction_state_pool); + anv_state_pool_finish(&device->dynamic_state_pool); + anv_state_pool_finish(&device->general_state_pool); + + anv_bo_pool_finish(&device->batch_bo_pool); + + anv_bo_cache_finish(&device->bo_cache); + + if (!anv_use_relocations(device->physical)) { + util_vma_heap_finish(&device->vma_hi); + util_vma_heap_finish(&device->vma_cva); + util_vma_heap_finish(&device->vma_lo); + } + + pthread_cond_destroy(&device->queue_submit); + pthread_mutex_destroy(&device->mutex); + + for (uint32_t i = 0; i < device->queue_count; i++) + anv_queue_finish(&device->queues[i]); + vk_free(&device->vk.alloc, device->queues); + + anv_gem_destroy_context(device, device->context_id); + + if (INTEL_DEBUG(DEBUG_BATCH)) + intel_batch_decode_ctx_finish(&device->decoder_ctx); + + close(device->fd); + + vk_device_finish(&device->vk); + vk_free(&device->vk.alloc, device); +} + +VkResult anv_EnumerateInstanceLayerProperties( + uint32_t* pPropertyCount, + VkLayerProperties* pProperties) +{ + if (pProperties == NULL) { + *pPropertyCount = 0; + return VK_SUCCESS; + } + + /* None supported at this time */ + return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT); +} + +static VkResult +anv_device_check_status(struct vk_device *vk_device) +{ + struct anv_device *device = container_of(vk_device, 
struct anv_device, vk); + + uint32_t active, pending; + int ret = anv_gem_context_get_reset_stats(device->fd, device->context_id, + &active, &pending); + if (ret == -1) { + /* We don't know the real error. */ + return vk_device_set_lost(&device->vk, "get_reset_stats failed: %m"); + } + + if (active) { + return vk_device_set_lost(&device->vk, "GPU hung on one of our command buffers"); + } else if (pending) { + return vk_device_set_lost(&device->vk, "GPU hung with commands in-flight"); + } + + return VK_SUCCESS; +} + +VkResult +anv_device_wait(struct anv_device *device, struct anv_bo *bo, + int64_t timeout) +{ + int ret = anv_gem_wait(device, bo->gem_handle, &timeout); + if (ret == -1 && errno == ETIME) { + return VK_TIMEOUT; + } else if (ret == -1) { + /* We don't know the real error. */ + return vk_device_set_lost(&device->vk, "gem wait failed: %m"); + } else { + return VK_SUCCESS; + } +} + +uint64_t +anv_vma_alloc(struct anv_device *device, + uint64_t size, uint64_t align, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address) +{ + pthread_mutex_lock(&device->vma_mutex); + + uint64_t addr = 0; + + if (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) { + if (client_address) { + if (util_vma_heap_alloc_addr(&device->vma_cva, + client_address, size)) { + addr = client_address; + } + } else { + addr = util_vma_heap_alloc(&device->vma_cva, size, align); + } + /* We don't want to fall back to other heaps */ + goto done; + } + + assert(client_address == 0); + + if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS)) + addr = util_vma_heap_alloc(&device->vma_hi, size, align); + + if (addr == 0) + addr = util_vma_heap_alloc(&device->vma_lo, size, align); + +done: + pthread_mutex_unlock(&device->vma_mutex); + + assert(addr == intel_48b_address(addr)); + return intel_canonical_address(addr); +} + +void +anv_vma_free(struct anv_device *device, + uint64_t address, uint64_t size) +{ + const uint64_t addr_48b = intel_48b_address(address); + + pthread_mutex_lock(&device->vma_mutex); + + if (addr_48b >= LOW_HEAP_MIN_ADDRESS && + addr_48b <= LOW_HEAP_MAX_ADDRESS) { + util_vma_heap_free(&device->vma_lo, addr_48b, size); + } else if (addr_48b >= CLIENT_VISIBLE_HEAP_MIN_ADDRESS && + addr_48b <= CLIENT_VISIBLE_HEAP_MAX_ADDRESS) { + util_vma_heap_free(&device->vma_cva, addr_48b, size); + } else { + assert(addr_48b >= HIGH_HEAP_MIN_ADDRESS); + util_vma_heap_free(&device->vma_hi, addr_48b, size); + } + + pthread_mutex_unlock(&device->vma_mutex); +} + +VkResult anv_AllocateMemory( + VkDevice _device, + const VkMemoryAllocateInfo* pAllocateInfo, + const VkAllocationCallbacks* pAllocator, + VkDeviceMemory* pMem) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_physical_device *pdevice = device->physical; + struct anv_device_memory *mem; + VkResult result = VK_SUCCESS; + + assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO); + + /* The Vulkan 1.0.33 spec says "allocationSize must be greater than 0". 
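The anv_vma_alloc() path above hands back addresses in canonical form. A minimal sketch of that convention, under the assumption (matching how intel_canonical_address() is commonly implemented) that canonicalization sign-extends bit 47 into the upper 16 bits; the helpers here are illustrative, not the driver's.

#include <assert.h>
#include <stdint.h>

/* Keep only the low 48 bits of an address. */
static uint64_t
example_48b_address(uint64_t addr)
{
   return addr & ((1ull << 48) - 1);
}

/* Replicate bit 47 into bits 48..63 (canonical form).  Relies on the
 * ubiquitous two's-complement arithmetic right shift.
 */
static uint64_t
example_canonical_address(uint64_t addr)
{
   return (uint64_t)((int64_t)(addr << 16) >> 16);
}

int main(void)
{
   uint64_t addr = 0xffff800000000000ull; /* high-heap style address */
   assert(example_canonical_address(example_48b_address(addr)) == addr);
   return 0;
}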
*/ + assert(pAllocateInfo->allocationSize > 0); + + VkDeviceSize aligned_alloc_size = + align_u64(pAllocateInfo->allocationSize, 4096); + + if (aligned_alloc_size > MAX_MEMORY_ALLOCATION_SIZE) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.type_count); + struct anv_memory_type *mem_type = + &pdevice->memory.types[pAllocateInfo->memoryTypeIndex]; + assert(mem_type->heapIndex < pdevice->memory.heap_count); + struct anv_memory_heap *mem_heap = + &pdevice->memory.heaps[mem_type->heapIndex]; + + uint64_t mem_heap_used = p_atomic_read(&mem_heap->used); + if (mem_heap_used + aligned_alloc_size > mem_heap->size) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + mem = vk_object_alloc(&device->vk, pAllocator, sizeof(*mem), + VK_OBJECT_TYPE_DEVICE_MEMORY); + if (mem == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + mem->type = mem_type; + mem->map = NULL; + mem->map_size = 0; + mem->map_delta = 0; + mem->ahw = NULL; + mem->host_ptr = NULL; + + enum anv_bo_alloc_flags alloc_flags = 0; + + const VkExportMemoryAllocateInfo *export_info = NULL; + const VkImportAndroidHardwareBufferInfoANDROID *ahw_import_info = NULL; + const VkImportMemoryFdInfoKHR *fd_info = NULL; + const VkImportMemoryHostPointerInfoEXT *host_ptr_info = NULL; + const VkMemoryDedicatedAllocateInfo *dedicated_info = NULL; + VkMemoryAllocateFlags vk_flags = 0; + uint64_t client_address = 0; + + vk_foreach_struct_const(ext, pAllocateInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO: + export_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID: + ahw_import_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR: + fd_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT: + host_ptr_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO: { + const VkMemoryAllocateFlagsInfo *flags_info = (void *)ext; + vk_flags = flags_info->flags; + break; + } + + case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO: + dedicated_info = (void *)ext; + break; + + case VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO: { + const VkMemoryOpaqueCaptureAddressAllocateInfo *addr_info = + (const VkMemoryOpaqueCaptureAddressAllocateInfo *)ext; + client_address = addr_info->opaqueCaptureAddress; + break; + } + + default: + if (ext->sType != VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA) + /* this isn't a real enum value, + * so use conditional to avoid compiler warn + */ + anv_debug_ignored_stype(ext->sType); + break; + } + } + + /* By default, we want all VkDeviceMemory objects to support CCS */ + if (device->physical->has_implicit_ccs && device->info->has_aux_map) + alloc_flags |= ANV_BO_ALLOC_IMPLICIT_CCS; + + /* If i915 reported a mappable/non_mappable vram regions and the + * application want lmem mappable, then we need to use the + * I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS flag to create our BO. 
+ */ + if (pdevice->vram_mappable.size > 0 && + pdevice->vram_non_mappable.size > 0 && + (mem_type->propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) && + (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) + alloc_flags |= ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE; + + if (vk_flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT) + alloc_flags |= ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS; + + if ((export_info && export_info->handleTypes) || + (fd_info && fd_info->handleType) || + (host_ptr_info && host_ptr_info->handleType)) { + /* Anything imported or exported is EXTERNAL */ + alloc_flags |= ANV_BO_ALLOC_EXTERNAL; + } + + /* Check if we need to support Android HW buffer export. If so, + * create AHardwareBuffer and import memory from it. + */ + bool android_export = false; + if (export_info && export_info->handleTypes & + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID) + android_export = true; + + if (ahw_import_info) { + result = anv_import_ahw_memory(_device, mem, ahw_import_info); + if (result != VK_SUCCESS) + goto fail; + + goto success; + } else if (android_export) { + result = anv_create_ahw_memory(_device, mem, pAllocateInfo); + if (result != VK_SUCCESS) + goto fail; + + goto success; + } + + /* The Vulkan spec permits handleType to be 0, in which case the struct is + * ignored. + */ + if (fd_info && fd_info->handleType) { + /* At the moment, we support only the below handle types. */ + assert(fd_info->handleType == + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || + fd_info->handleType == + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); + + result = anv_device_import_bo(device, fd_info->fd, alloc_flags, + client_address, &mem->bo); + if (result != VK_SUCCESS) + goto fail; + + /* For security purposes, we reject importing the bo if it's smaller + * than the requested allocation size. This prevents a malicious client + * from passing a buffer to a trusted client, lying about the size, and + * telling the trusted client to try and texture from an image that goes + * out-of-bounds. This sort of thing could lead to GPU hangs or worse + * in the trusted client. The trusted client can protect itself against + * this sort of attack but only if it can trust the buffer size. + */ + if (mem->bo->size < aligned_alloc_size) { + result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, + "aligned allocationSize too large for " + "VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: " + "%"PRIu64"B > %"PRIu64"B", + aligned_alloc_size, mem->bo->size); + anv_device_release_bo(device, mem->bo); + goto fail; + } + + /* From the Vulkan spec: + * + * "Importing memory from a file descriptor transfers ownership of + * the file descriptor from the application to the Vulkan + * implementation. The application must not perform any operations on + * the file descriptor after a successful import." + * + * If the import fails, we leave the file descriptor open. 
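A stripped-down version of the size check made a few lines above: an imported opaque-fd/dma-buf BO is rejected when it is smaller than the page-aligned requested allocation size, since a lying exporter could otherwise trick the importer into out-of-bounds GPU accesses. The names and the 4096-byte page size below are illustrative assumptions.

#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_PAGE_SIZE 4096u

static uint64_t
example_align_u64(uint64_t v, uint64_t a)
{
   return (v + a - 1) & ~(a - 1);
}

/* Returns true when an imported BO of bo_size bytes may safely back an
 * allocation of the requested size.
 */
static bool
example_import_size_ok(uint64_t bo_size, uint64_t requested_size)
{
   return bo_size >= example_align_u64(requested_size, EXAMPLE_PAGE_SIZE);
}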
+ */ + close(fd_info->fd); + goto success; + } + + if (host_ptr_info && host_ptr_info->handleType) { + if (host_ptr_info->handleType == + VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT) { + result = vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + goto fail; + } + + assert(host_ptr_info->handleType == + VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT); + + result = anv_device_import_bo_from_host_ptr(device, + host_ptr_info->pHostPointer, + pAllocateInfo->allocationSize, + alloc_flags, + client_address, + &mem->bo); + if (result != VK_SUCCESS) + goto fail; + + mem->host_ptr = host_ptr_info->pHostPointer; + goto success; + } + + /* Regular allocate (not importing memory). */ + + result = anv_device_alloc_bo(device, "user", pAllocateInfo->allocationSize, + alloc_flags, client_address, &mem->bo); + if (result != VK_SUCCESS) + goto fail; + + if (dedicated_info && dedicated_info->image != VK_NULL_HANDLE) { + ANV_FROM_HANDLE(anv_image, image, dedicated_info->image); + + /* Some legacy (non-modifiers) consumers need the tiling to be set on + * the BO. In this case, we have a dedicated allocation. + */ + if (image->vk.wsi_legacy_scanout) { + const struct isl_surf *surf = &image->planes[0].primary_surface.isl; + result = anv_device_set_bo_tiling(device, mem->bo, + surf->row_pitch_B, + surf->tiling); + if (result != VK_SUCCESS) { + anv_device_release_bo(device, mem->bo); + goto fail; + } + } + } + + success: + mem_heap_used = p_atomic_add_return(&mem_heap->used, mem->bo->size); + if (mem_heap_used > mem_heap->size) { + p_atomic_add(&mem_heap->used, -mem->bo->size); + anv_device_release_bo(device, mem->bo); + result = vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Out of heap memory"); + goto fail; + } + + pthread_mutex_lock(&device->mutex); + list_addtail(&mem->link, &device->memory_objects); + pthread_mutex_unlock(&device->mutex); + + *pMem = anv_device_memory_to_handle(mem); + + return VK_SUCCESS; + + fail: + vk_object_free(&device->vk, pAllocator, mem); + + return result; +} + +VkResult anv_GetMemoryFdKHR( + VkDevice device_h, + const VkMemoryGetFdInfoKHR* pGetFdInfo, + int* pFd) +{ + ANV_FROM_HANDLE(anv_device, dev, device_h); + ANV_FROM_HANDLE(anv_device_memory, mem, pGetFdInfo->memory); + + assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR); + + assert(pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT || + pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); + + return anv_device_export_bo(dev, mem->bo, pFd); +} + +VkResult anv_GetMemoryFdPropertiesKHR( + VkDevice _device, + VkExternalMemoryHandleTypeFlagBits handleType, + int fd, + VkMemoryFdPropertiesKHR* pMemoryFdProperties) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + switch (handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + /* dma-buf can be imported as any memory type */ + pMemoryFdProperties->memoryTypeBits = + (1 << device->physical->memory.type_count) - 1; + return VK_SUCCESS; + + default: + /* The valid usage section for this function says: + * + * "handleType must not be one of the handle types defined as + * opaque." + * + * So opaque handle types fall into the default "unsupported" case. 
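The "any memory type" answer given above is just a full bitmask over the physical device's memory types. A hedged illustration, where type_count stands in for device->physical->memory.type_count:

#include <assert.h>
#include <stdint.h>

/* One bit per supported memory type; dma-buf imports may use any of them. */
static uint32_t
example_all_memory_types(uint32_t type_count)
{
   assert(type_count > 0 && type_count <= 32);
   return (uint32_t)((1ull << type_count) - 1);
}

int main(void)
{
   assert(example_all_memory_types(3) == 0x7); /* types 0, 1 and 2 */
   return 0;
}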
+ */ + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); + } +} + +VkResult anv_GetMemoryHostPointerPropertiesEXT( + VkDevice _device, + VkExternalMemoryHandleTypeFlagBits handleType, + const void* pHostPointer, + VkMemoryHostPointerPropertiesEXT* pMemoryHostPointerProperties) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + assert(pMemoryHostPointerProperties->sType == + VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT); + + switch (handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT: + /* Host memory can be imported as any memory type. */ + pMemoryHostPointerProperties->memoryTypeBits = + (1ull << device->physical->memory.type_count) - 1; + + return VK_SUCCESS; + + default: + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + } +} + +void anv_FreeMemory( + VkDevice _device, + VkDeviceMemory _mem, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_device_memory, mem, _mem); + + if (mem == NULL) + return; + + pthread_mutex_lock(&device->mutex); + list_del(&mem->link); + pthread_mutex_unlock(&device->mutex); + + if (mem->map) + anv_UnmapMemory(_device, _mem); + + p_atomic_add(&device->physical->memory.heaps[mem->type->heapIndex].used, + -mem->bo->size); + + anv_device_release_bo(device, mem->bo); + +#if defined(ANDROID) && ANDROID_API_LEVEL >= 26 + if (mem->ahw) + AHardwareBuffer_release(mem->ahw); +#endif + + vk_object_free(&device->vk, pAllocator, mem); +} + +VkResult anv_MapMemory( + VkDevice _device, + VkDeviceMemory _memory, + VkDeviceSize offset, + VkDeviceSize size, + VkMemoryMapFlags flags, + void** ppData) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_device_memory, mem, _memory); + + if (mem == NULL) { + *ppData = NULL; + return VK_SUCCESS; + } + + if (mem->host_ptr) { + *ppData = mem->host_ptr + offset; + return VK_SUCCESS; + } + + /* From the Vulkan spec version 1.0.32 docs for MapMemory: + * + * * memory must have been created with a memory type that reports + * VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT + */ + if (!(mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) { + return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, + "Memory object not mappable."); + } + + if (size == VK_WHOLE_SIZE) + size = mem->bo->size - offset; + + /* From the Vulkan spec version 1.0.32 docs for MapMemory: + * + * * If size is not equal to VK_WHOLE_SIZE, size must be greater than 0 + * assert(size != 0); + * * If size is not equal to VK_WHOLE_SIZE, size must be less than or + * equal to the size of the memory minus offset + */ + assert(size > 0); + assert(offset + size <= mem->bo->size); + + if (size != (size_t)size) { + return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, + "requested size 0x%"PRIx64" does not fit in %u bits", + size, (unsigned)(sizeof(size_t) * 8)); + } + + /* From the Vulkan 1.2.194 spec: + * + * "memory must not be currently host mapped" + */ + if (mem->map != NULL) { + return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, + "Memory object already mapped."); + } + + uint32_t gem_flags = 0; + + if (!device->info->has_llc && + (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) + gem_flags |= I915_MMAP_WC; + + /* GEM will fail to map if the offset isn't 4k-aligned. Round down. 
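The rounding that follows can be summarized with a tiny worked example: a map request at offset 0x1234 of size 0x100 maps the BO from 0x1000 for 0x1000 bytes (whole pages), and the pointer returned to the app is map + 0x234. A sketch of that arithmetic, using assumed helper names:

#include <assert.h>
#include <stdint.h>

#define EXAMPLE_PAGE_MASK 4095ull

static uint64_t
example_align_u64(uint64_t v, uint64_t a)
{
   return (v + a - 1) & ~(a - 1);
}

int main(void)
{
   uint64_t offset = 0x1234, size = 0x100;

   uint64_t map_offset = offset & ~EXAMPLE_PAGE_MASK;               /* 0x1000 */
   uint64_t map_size =
      example_align_u64(offset + size - map_offset, 4096);          /* 0x1000 */
   uint64_t map_delta = offset - map_offset;                        /* 0x234  */

   assert(map_offset == 0x1000 && map_size == 0x1000 && map_delta == 0x234);
   return 0;
}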
*/ + uint64_t map_offset; + if (!device->physical->has_mmap_offset) + map_offset = offset & ~4095ull; + else + map_offset = 0; + assert(offset >= map_offset); + uint64_t map_size = (offset + size) - map_offset; + + /* Let's map whole pages */ + map_size = align_u64(map_size, 4096); + + void *map; + VkResult result = anv_device_map_bo(device, mem->bo, map_offset, + map_size, gem_flags, &map); + if (result != VK_SUCCESS) + return result; + + mem->map = map; + mem->map_size = map_size; + mem->map_delta = (offset - map_offset); + *ppData = mem->map + mem->map_delta; + + return VK_SUCCESS; +} + +void anv_UnmapMemory( + VkDevice _device, + VkDeviceMemory _memory) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_device_memory, mem, _memory); + + if (mem == NULL || mem->host_ptr) + return; + + anv_device_unmap_bo(device, mem->bo, mem->map, mem->map_size); + + mem->map = NULL; + mem->map_size = 0; + mem->map_delta = 0; +} + +VkResult anv_FlushMappedMemoryRanges( + VkDevice _device, + uint32_t memoryRangeCount, + const VkMappedMemoryRange* pMemoryRanges) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!device->physical->memory.need_clflush) + return VK_SUCCESS; + + /* Make sure the writes we're flushing have landed. */ + __builtin_ia32_mfence(); + + for (uint32_t i = 0; i < memoryRangeCount; i++) { + ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryRanges[i].memory); + if (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) + continue; + + uint64_t map_offset = pMemoryRanges[i].offset + mem->map_delta; + if (map_offset >= mem->map_size) + continue; + + intel_clflush_range(mem->map + map_offset, + MIN2(pMemoryRanges[i].size, + mem->map_size - map_offset)); + } + + return VK_SUCCESS; +} + +VkResult anv_InvalidateMappedMemoryRanges( + VkDevice _device, + uint32_t memoryRangeCount, + const VkMappedMemoryRange* pMemoryRanges) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!device->physical->memory.need_clflush) + return VK_SUCCESS; + + for (uint32_t i = 0; i < memoryRangeCount; i++) { + ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryRanges[i].memory); + if (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) + continue; + + uint64_t map_offset = pMemoryRanges[i].offset + mem->map_delta; + if (map_offset >= mem->map_size) + continue; + + intel_invalidate_range(mem->map + map_offset, + MIN2(pMemoryRanges[i].size, + mem->map_size - map_offset)); + } + + /* Make sure no reads get moved up above the invalidate. 
*/ + __builtin_ia32_mfence(); + + return VK_SUCCESS; +} + +void anv_GetDeviceMemoryCommitment( + VkDevice device, + VkDeviceMemory memory, + VkDeviceSize* pCommittedMemoryInBytes) +{ + *pCommittedMemoryInBytes = 0; +} + +static void +anv_bind_buffer_memory(const VkBindBufferMemoryInfo *pBindInfo) +{ + ANV_FROM_HANDLE(anv_device_memory, mem, pBindInfo->memory); + ANV_FROM_HANDLE(anv_buffer, buffer, pBindInfo->buffer); + + assert(pBindInfo->sType == VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO); + + if (mem) { + assert(pBindInfo->memoryOffset < mem->bo->size); + assert(mem->bo->size - pBindInfo->memoryOffset >= buffer->vk.size); + buffer->address = (struct anv_address) { + .bo = mem->bo, + .offset = pBindInfo->memoryOffset, + }; + } else { + buffer->address = ANV_NULL_ADDRESS; + } +} + +VkResult anv_BindBufferMemory2( + VkDevice device, + uint32_t bindInfoCount, + const VkBindBufferMemoryInfo* pBindInfos) +{ + for (uint32_t i = 0; i < bindInfoCount; i++) + anv_bind_buffer_memory(&pBindInfos[i]); + + return VK_SUCCESS; +} + +VkResult anv_QueueBindSparse( + VkQueue _queue, + uint32_t bindInfoCount, + const VkBindSparseInfo* pBindInfo, + VkFence fence) +{ + ANV_FROM_HANDLE(anv_queue, queue, _queue); + if (vk_device_is_lost(&queue->device->vk)) + return VK_ERROR_DEVICE_LOST; + + return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT); +} + +// Event functions + +VkResult anv_CreateEvent( + VkDevice _device, + const VkEventCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkEvent* pEvent) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_event *event; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_EVENT_CREATE_INFO); + + event = vk_object_alloc(&device->vk, pAllocator, sizeof(*event), + VK_OBJECT_TYPE_EVENT); + if (event == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + event->state = anv_state_pool_alloc(&device->dynamic_state_pool, + sizeof(uint64_t), 8); + *(uint64_t *)event->state.map = VK_EVENT_RESET; + + *pEvent = anv_event_to_handle(event); + + return VK_SUCCESS; +} + +void anv_DestroyEvent( + VkDevice _device, + VkEvent _event, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_event, event, _event); + + if (!event) + return; + + anv_state_pool_free(&device->dynamic_state_pool, event->state); + + vk_object_free(&device->vk, pAllocator, event); +} + +VkResult anv_GetEventStatus( + VkDevice _device, + VkEvent _event) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_event, event, _event); + + if (vk_device_is_lost(&device->vk)) + return VK_ERROR_DEVICE_LOST; + + return *(uint64_t *)event->state.map; +} + +VkResult anv_SetEvent( + VkDevice _device, + VkEvent _event) +{ + ANV_FROM_HANDLE(anv_event, event, _event); + + *(uint64_t *)event->state.map = VK_EVENT_SET; + + return VK_SUCCESS; +} + +VkResult anv_ResetEvent( + VkDevice _device, + VkEvent _event) +{ + ANV_FROM_HANDLE(anv_event, event, _event); + + *(uint64_t *)event->state.map = VK_EVENT_RESET; + + return VK_SUCCESS; +} + +// Buffer functions + +static void +anv_get_buffer_memory_requirements(struct anv_device *device, + VkDeviceSize size, + VkBufferUsageFlags usage, + VkMemoryRequirements2* pMemoryRequirements) +{ + /* The Vulkan spec (git aaed022) says: + * + * memoryTypeBits is a bitfield and contains one bit set for every + * supported memory type for the resource. 
The bit `1<<i` is set if and
+    * only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
+    * structure for the physical device is supported.
+    */
+   uint32_t memory_types = (1ull << device->physical->memory.type_count) - 1;
+
+   /* Base alignment requirement of a cache line */
+   uint32_t alignment = 16;
+
+   if (usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT)
+      alignment = MAX2(alignment, ANV_UBO_ALIGNMENT);
+
+   pMemoryRequirements->memoryRequirements.size = size;
+   pMemoryRequirements->memoryRequirements.alignment = alignment;
+
+   /* Storage and Uniform buffers should have their size aligned to
+    * 32-bits to avoid boundary checks when the last DWord is not complete.
+    * This ensures that no internal padding is needed for 16-bit types.
+    */
+   if (device->robust_buffer_access &&
+       (usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT ||
+        usage & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT))
+      pMemoryRequirements->memoryRequirements.size = align_u64(size, 4);
+
+   pMemoryRequirements->memoryRequirements.memoryTypeBits = memory_types;
+
+   vk_foreach_struct(ext, pMemoryRequirements->pNext) {
+      switch (ext->sType) {
+      case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
+         VkMemoryDedicatedRequirements *requirements = (void *)ext;
+         requirements->prefersDedicatedAllocation = false;
+         requirements->requiresDedicatedAllocation = false;
+         break;
+      }
+
+      default:
+         anv_debug_ignored_stype(ext->sType);
+         break;
+      }
+   }
+}
+
+void anv_GetBufferMemoryRequirements2(
+    VkDevice                                    _device,
+    const VkBufferMemoryRequirementsInfo2*      pInfo,
+    VkMemoryRequirements2*                      pMemoryRequirements)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
+
+   anv_get_buffer_memory_requirements(device,
+                                      buffer->vk.size,
+                                      buffer->vk.usage,
+                                      pMemoryRequirements);
+}
+
+void anv_GetDeviceBufferMemoryRequirementsKHR(
+    VkDevice                                    _device,
+    const VkDeviceBufferMemoryRequirements*     pInfo,
+    VkMemoryRequirements2*                      pMemoryRequirements)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   anv_get_buffer_memory_requirements(device,
+                                      pInfo->pCreateInfo->size,
+                                      pInfo->pCreateInfo->usage,
+                                      pMemoryRequirements);
+}
+
+VkResult anv_CreateBuffer(
+    VkDevice                                    _device,
+    const VkBufferCreateInfo*                   pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkBuffer*                                   pBuffer)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_buffer *buffer;
+
+   /* Don't allow creating buffers bigger than our address space.  The real
+    * issue here is that we may align up the buffer size and we don't want
+    * doing so to cause roll-over.  However, no one has any business
+    * allocating a buffer larger than our GTT size.
+ */ + if (pCreateInfo->size > device->physical->gtt_size) + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + buffer = vk_buffer_create(&device->vk, pCreateInfo, + pAllocator, sizeof(*buffer)); + if (buffer == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + buffer->address = ANV_NULL_ADDRESS; + + *pBuffer = anv_buffer_to_handle(buffer); + + return VK_SUCCESS; +} + +void anv_DestroyBuffer( + VkDevice _device, + VkBuffer _buffer, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + + if (!buffer) + return; + + vk_buffer_destroy(&device->vk, pAllocator, &buffer->vk); +} + +VkDeviceAddress anv_GetBufferDeviceAddress( + VkDevice device, + const VkBufferDeviceAddressInfo* pInfo) +{ + ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer); + + assert(!anv_address_is_null(buffer->address)); + assert(anv_bo_is_pinned(buffer->address.bo)); + + return anv_address_physical(buffer->address); +} + +uint64_t anv_GetBufferOpaqueCaptureAddress( + VkDevice device, + const VkBufferDeviceAddressInfo* pInfo) +{ + return 0; +} + +uint64_t anv_GetDeviceMemoryOpaqueCaptureAddress( + VkDevice device, + const VkDeviceMemoryOpaqueCaptureAddressInfo* pInfo) +{ + ANV_FROM_HANDLE(anv_device_memory, memory, pInfo->memory); + + assert(anv_bo_is_pinned(memory->bo)); + assert(memory->bo->has_client_visible_address); + + return intel_48b_address(memory->bo->offset); +} + +void +anv_fill_buffer_surface_state(struct anv_device *device, struct anv_state state, + enum isl_format format, + struct isl_swizzle swizzle, + isl_surf_usage_flags_t usage, + struct anv_address address, + uint32_t range, uint32_t stride) +{ + isl_buffer_fill_state(&device->isl_dev, state.map, + .address = anv_address_physical(address), + .mocs = isl_mocs(&device->isl_dev, usage, + address.bo && address.bo->is_external), + .size_B = range, + .format = format, + .swizzle = swizzle, + .stride_B = stride); +} + +void anv_DestroySampler( + VkDevice _device, + VkSampler _sampler, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_sampler, sampler, _sampler); + + if (!sampler) + return; + + if (sampler->bindless_state.map) { + anv_state_pool_free(&device->dynamic_state_pool, + sampler->bindless_state); + } + + if (sampler->custom_border_color.map) { + anv_state_reserved_pool_free(&device->custom_border_colors, + sampler->custom_border_color); + } + + vk_object_free(&device->vk, pAllocator, sampler); +} + +static const VkTimeDomainEXT anv_time_domains[] = { + VK_TIME_DOMAIN_DEVICE_EXT, + VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT, +#ifdef CLOCK_MONOTONIC_RAW + VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT, +#endif +}; + +VkResult anv_GetPhysicalDeviceCalibrateableTimeDomainsEXT( + VkPhysicalDevice physicalDevice, + uint32_t *pTimeDomainCount, + VkTimeDomainEXT *pTimeDomains) +{ + int d; + VK_OUTARRAY_MAKE_TYPED(VkTimeDomainEXT, out, pTimeDomains, pTimeDomainCount); + + for (d = 0; d < ARRAY_SIZE(anv_time_domains); d++) { + vk_outarray_append_typed(VkTimeDomainEXT, &out, i) { + *i = anv_time_domains[d]; + } + } + + return vk_outarray_status(&out); +} + +static uint64_t +anv_clock_gettime(clockid_t clock_id) +{ + struct timespec current; + int ret; + + ret = clock_gettime(clock_id, ¤t); +#ifdef CLOCK_MONOTONIC_RAW + if (ret < 0 && clock_id == CLOCK_MONOTONIC_RAW) + ret = clock_gettime(CLOCK_MONOTONIC, ¤t); +#endif + if (ret < 0) + return 0; + + return (uint64_t) current.tv_sec * 1000000000ULL + 
current.tv_nsec; +} + +VkResult anv_GetCalibratedTimestampsEXT( + VkDevice _device, + uint32_t timestampCount, + const VkCalibratedTimestampInfoEXT *pTimestampInfos, + uint64_t *pTimestamps, + uint64_t *pMaxDeviation) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + uint64_t timestamp_frequency = device->info->timestamp_frequency; + int ret; + int d; + uint64_t begin, end; + uint64_t max_clock_period = 0; + +#ifdef CLOCK_MONOTONIC_RAW + begin = anv_clock_gettime(CLOCK_MONOTONIC_RAW); +#else + begin = anv_clock_gettime(CLOCK_MONOTONIC); +#endif + + for (d = 0; d < timestampCount; d++) { + switch (pTimestampInfos[d].timeDomain) { + case VK_TIME_DOMAIN_DEVICE_EXT: + ret = anv_gem_reg_read(device->fd, TIMESTAMP | I915_REG_READ_8B_WA, + &pTimestamps[d]); + + if (ret != 0) { + return vk_device_set_lost(&device->vk, "Failed to read the " + "TIMESTAMP register: %m"); + } + uint64_t device_period = DIV_ROUND_UP(1000000000, timestamp_frequency); + max_clock_period = MAX2(max_clock_period, device_period); + break; + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT: + pTimestamps[d] = anv_clock_gettime(CLOCK_MONOTONIC); + max_clock_period = MAX2(max_clock_period, 1); + break; + +#ifdef CLOCK_MONOTONIC_RAW + case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT: + pTimestamps[d] = begin; + break; +#endif + default: + pTimestamps[d] = 0; + break; + } + } + +#ifdef CLOCK_MONOTONIC_RAW + end = anv_clock_gettime(CLOCK_MONOTONIC_RAW); +#else + end = anv_clock_gettime(CLOCK_MONOTONIC); +#endif + + /* + * The maximum deviation is the sum of the interval over which we + * perform the sampling and the maximum period of any sampled + * clock. That's because the maximum skew between any two sampled + * clock edges is when the sampled clock with the largest period is + * sampled at the end of that period but right at the beginning of the + * sampling interval and some other clock is sampled right at the + * beginning of its sampling period and right at the end of the + * sampling interval. 
Let's assume the GPU has the longest clock + * period and that the application is sampling GPU and monotonic: + * + * s e + * w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f + * Raw -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_- + * + * g + * 0 1 2 3 + * GPU -----_____-----_____-----_____-----_____ + * + * m + * x y z 0 1 2 3 4 5 6 7 8 9 a b c + * Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_- + * + * Interval <-----------------> + * Deviation <--------------------------> + * + * s = read(raw) 2 + * g = read(GPU) 1 + * m = read(monotonic) 2 + * e = read(raw) b + * + * We round the sample interval up by one tick to cover sampling error + * in the interval clock + */ + + uint64_t sample_interval = end - begin + 1; + + *pMaxDeviation = sample_interval + max_clock_period; + + return VK_SUCCESS; +} + +void anv_GetPhysicalDeviceMultisamplePropertiesEXT( + VkPhysicalDevice physicalDevice, + VkSampleCountFlagBits samples, + VkMultisamplePropertiesEXT* pMultisampleProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + + assert(pMultisampleProperties->sType == + VK_STRUCTURE_TYPE_MULTISAMPLE_PROPERTIES_EXT); + + VkExtent2D grid_size; + if (samples & isl_device_get_sample_counts(&physical_device->isl_dev)) { + grid_size.width = 1; + grid_size.height = 1; + } else { + grid_size.width = 0; + grid_size.height = 0; + } + pMultisampleProperties->maxSampleLocationGridSize = grid_size; + + vk_foreach_struct(ext, pMultisampleProperties->pNext) + anv_debug_ignored_stype(ext->sType); +} + +/* vk_icd.h does not declare this function, so we declare it here to + * suppress Wmissing-prototypes. + */ +PUBLIC VKAPI_ATTR VkResult VKAPI_CALL +vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion); + +PUBLIC VKAPI_ATTR VkResult VKAPI_CALL +vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion) +{ + /* For the full details on loader interface versioning, see + * . + * What follows is a condensed summary, to help you navigate the large and + * confusing official doc. + * + * - Loader interface v0 is incompatible with later versions. We don't + * support it. + * + * - In loader interface v1: + * - The first ICD entrypoint called by the loader is + * vk_icdGetInstanceProcAddr(). The ICD must statically expose this + * entrypoint. + * - The ICD must statically expose no other Vulkan symbol unless it is + * linked with -Bsymbolic. + * - Each dispatchable Vulkan handle created by the ICD must be + * a pointer to a struct whose first member is VK_LOADER_DATA. The + * ICD must initialize VK_LOADER_DATA.loadMagic to ICD_LOADER_MAGIC. + * - The loader implements vkCreate{PLATFORM}SurfaceKHR() and + * vkDestroySurfaceKHR(). The ICD must be capable of working with + * such loader-managed surfaces. + * + * - Loader interface v2 differs from v1 in: + * - The first ICD entrypoint called by the loader is + * vk_icdNegotiateLoaderICDInterfaceVersion(). The ICD must + * statically expose this entrypoint. + * + * - Loader interface v3 differs from v2 in: + * - The ICD must implement vkCreate{PLATFORM}SurfaceKHR(), + * vkDestroySurfaceKHR(), and other API which uses VKSurfaceKHR, + * because the loader no longer does so. + * + * - Loader interface v4 differs from v3 in: + * - The ICD must implement vk_icdGetPhysicalDeviceProcAddr(). 
+ * + * - Loader interface v5 differs from v4 in: + * - The ICD must support Vulkan API version 1.1 and must not return + * VK_ERROR_INCOMPATIBLE_DRIVER from vkCreateInstance() unless a + * Vulkan Loader with interface v4 or smaller is being used and the + * application provides an API version that is greater than 1.0. + */ + *pSupportedVersion = MIN2(*pSupportedVersion, 5u); + return VK_SUCCESS; +} + +VkResult anv_GetPhysicalDeviceFragmentShadingRatesKHR( + VkPhysicalDevice physicalDevice, + uint32_t* pFragmentShadingRateCount, + VkPhysicalDeviceFragmentShadingRateKHR* pFragmentShadingRates) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + VK_OUTARRAY_MAKE_TYPED(VkPhysicalDeviceFragmentShadingRateKHR, out, + pFragmentShadingRates, pFragmentShadingRateCount); + +#define append_rate(_samples, _width, _height) \ + do { \ + vk_outarray_append_typed(VkPhysicalDeviceFragmentShadingRateKHR, &out, __r) { \ + __r->sampleCounts = _samples; \ + __r->fragmentSize = (VkExtent2D) { \ + .width = _width, \ + .height = _height, \ + }; \ + } \ + } while (0) + + VkSampleCountFlags sample_counts = + isl_device_get_sample_counts(&physical_device->isl_dev); + + /* BSpec 47003: There are a number of restrictions on the sample count + * based off the coarse pixel size. + */ + static const VkSampleCountFlags cp_size_sample_limits[] = { + [1] = ISL_SAMPLE_COUNT_16_BIT | ISL_SAMPLE_COUNT_8_BIT | + ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT, + [2] = ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT, + [4] = ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT, + [8] = ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT, + [16] = ISL_SAMPLE_COUNT_1_BIT, + }; + + for (uint32_t x = 4; x >= 1; x /= 2) { + for (uint32_t y = 4; y >= 1; y /= 2) { + if (physical_device->info.has_coarse_pixel_primitive_and_cb) { + /* BSpec 47003: + * "CPsize 1x4 and 4x1 are not supported" + */ + if ((x == 1 && y == 4) || (x == 4 && y == 1)) + continue; + + /* For size {1, 1}, the sample count must be ~0 + * + * 4x2 is also a specially case. + */ + if (x == 1 && y == 1) + append_rate(~0, x, y); + else if (x == 4 && y == 2) + append_rate(ISL_SAMPLE_COUNT_1_BIT, x, y); + else + append_rate(cp_size_sample_limits[x * y], x, y); + } else { + /* For size {1, 1}, the sample count must be ~0 */ + if (x == 1 && y == 1) + append_rate(~0, x, y); + else + append_rate(sample_counts, x, y); + } + } + } + +#undef append_rate + + return vk_outarray_status(&out); +} diff --git a/src/intel/vulkan_hasvk/anv_formats.c b/src/intel/vulkan_hasvk/anv_formats.c new file mode 100644 index 00000000000..029a6080926 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_formats.c @@ -0,0 +1,1745 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" +#include "drm-uapi/drm_fourcc.h" +#include "vk_enum_defines.h" +#include "vk_enum_to_str.h" +#include "vk_format.h" +#include "vk_util.h" + +/* + * gcc-4 and earlier don't allow compound literals where a constant + * is required in -std=c99/gnu99 mode, so we can't use ISL_SWIZZLE() + * here. -std=c89/gnu89 would allow it, but we depend on c99 features + * so using -std=c89/gnu89 is not an option. Starting from gcc-5 + * compound literals can also be considered constant in -std=c99/gnu99 + * mode. + */ +#define _ISL_SWIZZLE(r, g, b, a) { \ + ISL_CHANNEL_SELECT_##r, \ + ISL_CHANNEL_SELECT_##g, \ + ISL_CHANNEL_SELECT_##b, \ + ISL_CHANNEL_SELECT_##a, \ +} + +#define RGBA _ISL_SWIZZLE(RED, GREEN, BLUE, ALPHA) +#define BGRA _ISL_SWIZZLE(BLUE, GREEN, RED, ALPHA) +#define RGB1 _ISL_SWIZZLE(RED, GREEN, BLUE, ONE) + +#define swiz_fmt1(__vk_fmt, __hw_fmt, __swizzle) \ + [VK_ENUM_OFFSET(__vk_fmt)] = { \ + .planes = { \ + { .isl_format = __hw_fmt, .swizzle = __swizzle, \ + .denominator_scales = { 1, 1, }, \ + .aspect = VK_IMAGE_ASPECT_COLOR_BIT, \ + }, \ + }, \ + .vk_format = __vk_fmt, \ + .n_planes = 1, \ + } + +#define fmt1(__vk_fmt, __hw_fmt) \ + swiz_fmt1(__vk_fmt, __hw_fmt, RGBA) + +#define d_fmt(__vk_fmt, __hw_fmt) \ + [VK_ENUM_OFFSET(__vk_fmt)] = { \ + .planes = { \ + { .isl_format = __hw_fmt, .swizzle = RGBA, \ + .denominator_scales = { 1, 1, }, \ + .aspect = VK_IMAGE_ASPECT_DEPTH_BIT, \ + }, \ + }, \ + .vk_format = __vk_fmt, \ + .n_planes = 1, \ + } + +#define s_fmt(__vk_fmt, __hw_fmt) \ + [VK_ENUM_OFFSET(__vk_fmt)] = { \ + .planes = { \ + { .isl_format = __hw_fmt, .swizzle = RGBA, \ + .denominator_scales = { 1, 1, }, \ + .aspect = VK_IMAGE_ASPECT_STENCIL_BIT, \ + }, \ + }, \ + .vk_format = __vk_fmt, \ + .n_planes = 1, \ + } + +#define ds_fmt2(__vk_fmt, __fmt1, __fmt2) \ + [VK_ENUM_OFFSET(__vk_fmt)] = { \ + .planes = { \ + { .isl_format = __fmt1, .swizzle = RGBA, \ + .denominator_scales = { 1, 1, }, \ + .aspect = VK_IMAGE_ASPECT_DEPTH_BIT, \ + }, \ + { .isl_format = __fmt2, .swizzle = RGBA, \ + .denominator_scales = { 1, 1, }, \ + .aspect = VK_IMAGE_ASPECT_STENCIL_BIT, \ + }, \ + }, \ + .vk_format = __vk_fmt, \ + .n_planes = 2, \ + } + +#define fmt_unsupported(__vk_fmt) \ + [VK_ENUM_OFFSET(__vk_fmt)] = { \ + .planes = { \ + { .isl_format = ISL_FORMAT_UNSUPPORTED, }, \ + }, \ + .vk_format = VK_FORMAT_UNDEFINED, \ + } + +#define y_plane(__plane, __hw_fmt, __swizzle, __ycbcr_swizzle, dhs, dvs) \ + { .isl_format = __hw_fmt, \ + .swizzle = __swizzle, \ + .ycbcr_swizzle = __ycbcr_swizzle, \ + .denominator_scales = { dhs, dvs, }, \ + .has_chroma = false, \ + .aspect = VK_IMAGE_ASPECT_PLANE_0_BIT, /* Y plane is always plane 0 */ \ + } + +#define chroma_plane(__plane, __hw_fmt, __swizzle, __ycbcr_swizzle, dhs, dvs) \ + { .isl_format = __hw_fmt, \ + .swizzle = __swizzle, \ + .ycbcr_swizzle = __ycbcr_swizzle, \ + .denominator_scales = { dhs, dvs, }, \ + .has_chroma = true, \ + .aspect = VK_IMAGE_ASPECT_PLANE_ ## __plane ## _BIT, \ + } + +#define ycbcr_fmt(__vk_fmt, __n_planes, ...) 
\ + [VK_ENUM_OFFSET(__vk_fmt)] = { \ + .planes = { \ + __VA_ARGS__, \ + }, \ + .vk_format = __vk_fmt, \ + .n_planes = __n_planes, \ + .can_ycbcr = true, \ + } + +/* HINT: For array formats, the ISL name should match the VK name. For + * packed formats, they should have the channels in reverse order from each + * other. The reason for this is that, for packed formats, the ISL (and + * bspec) names are in LSB -> MSB order while VK formats are MSB -> LSB. + */ +static const struct anv_format main_formats[] = { + fmt_unsupported(VK_FORMAT_UNDEFINED), + fmt_unsupported(VK_FORMAT_R4G4_UNORM_PACK8), + fmt1(VK_FORMAT_R4G4B4A4_UNORM_PACK16, ISL_FORMAT_A4B4G4R4_UNORM), + swiz_fmt1(VK_FORMAT_B4G4R4A4_UNORM_PACK16, ISL_FORMAT_A4B4G4R4_UNORM, BGRA), + fmt1(VK_FORMAT_R5G6B5_UNORM_PACK16, ISL_FORMAT_B5G6R5_UNORM), + swiz_fmt1(VK_FORMAT_B5G6R5_UNORM_PACK16, ISL_FORMAT_B5G6R5_UNORM, BGRA), + fmt1(VK_FORMAT_R5G5B5A1_UNORM_PACK16, ISL_FORMAT_A1B5G5R5_UNORM), + swiz_fmt1(VK_FORMAT_B5G5R5A1_UNORM_PACK16, ISL_FORMAT_A1B5G5R5_UNORM, BGRA), + fmt1(VK_FORMAT_A1R5G5B5_UNORM_PACK16, ISL_FORMAT_B5G5R5A1_UNORM), + fmt1(VK_FORMAT_R8_UNORM, ISL_FORMAT_R8_UNORM), + fmt1(VK_FORMAT_R8_SNORM, ISL_FORMAT_R8_SNORM), + fmt1(VK_FORMAT_R8_USCALED, ISL_FORMAT_R8_USCALED), + fmt1(VK_FORMAT_R8_SSCALED, ISL_FORMAT_R8_SSCALED), + fmt1(VK_FORMAT_R8_UINT, ISL_FORMAT_R8_UINT), + fmt1(VK_FORMAT_R8_SINT, ISL_FORMAT_R8_SINT), + swiz_fmt1(VK_FORMAT_R8_SRGB, ISL_FORMAT_L8_UNORM_SRGB, + _ISL_SWIZZLE(RED, ZERO, ZERO, ONE)), + fmt1(VK_FORMAT_R8G8_UNORM, ISL_FORMAT_R8G8_UNORM), + fmt1(VK_FORMAT_R8G8_SNORM, ISL_FORMAT_R8G8_SNORM), + fmt1(VK_FORMAT_R8G8_USCALED, ISL_FORMAT_R8G8_USCALED), + fmt1(VK_FORMAT_R8G8_SSCALED, ISL_FORMAT_R8G8_SSCALED), + fmt1(VK_FORMAT_R8G8_UINT, ISL_FORMAT_R8G8_UINT), + fmt1(VK_FORMAT_R8G8_SINT, ISL_FORMAT_R8G8_SINT), + fmt_unsupported(VK_FORMAT_R8G8_SRGB), /* L8A8_UNORM_SRGB */ + fmt1(VK_FORMAT_R8G8B8_UNORM, ISL_FORMAT_R8G8B8_UNORM), + fmt1(VK_FORMAT_R8G8B8_SNORM, ISL_FORMAT_R8G8B8_SNORM), + fmt1(VK_FORMAT_R8G8B8_USCALED, ISL_FORMAT_R8G8B8_USCALED), + fmt1(VK_FORMAT_R8G8B8_SSCALED, ISL_FORMAT_R8G8B8_SSCALED), + fmt1(VK_FORMAT_R8G8B8_UINT, ISL_FORMAT_R8G8B8_UINT), + fmt1(VK_FORMAT_R8G8B8_SINT, ISL_FORMAT_R8G8B8_SINT), + fmt1(VK_FORMAT_R8G8B8_SRGB, ISL_FORMAT_R8G8B8_UNORM_SRGB), + fmt1(VK_FORMAT_R8G8B8A8_UNORM, ISL_FORMAT_R8G8B8A8_UNORM), + fmt1(VK_FORMAT_R8G8B8A8_SNORM, ISL_FORMAT_R8G8B8A8_SNORM), + fmt1(VK_FORMAT_R8G8B8A8_USCALED, ISL_FORMAT_R8G8B8A8_USCALED), + fmt1(VK_FORMAT_R8G8B8A8_SSCALED, ISL_FORMAT_R8G8B8A8_SSCALED), + fmt1(VK_FORMAT_R8G8B8A8_UINT, ISL_FORMAT_R8G8B8A8_UINT), + fmt1(VK_FORMAT_R8G8B8A8_SINT, ISL_FORMAT_R8G8B8A8_SINT), + fmt1(VK_FORMAT_R8G8B8A8_SRGB, ISL_FORMAT_R8G8B8A8_UNORM_SRGB), + fmt1(VK_FORMAT_A8B8G8R8_UNORM_PACK32, ISL_FORMAT_R8G8B8A8_UNORM), + fmt1(VK_FORMAT_A8B8G8R8_SNORM_PACK32, ISL_FORMAT_R8G8B8A8_SNORM), + fmt1(VK_FORMAT_A8B8G8R8_USCALED_PACK32, ISL_FORMAT_R8G8B8A8_USCALED), + fmt1(VK_FORMAT_A8B8G8R8_SSCALED_PACK32, ISL_FORMAT_R8G8B8A8_SSCALED), + fmt1(VK_FORMAT_A8B8G8R8_UINT_PACK32, ISL_FORMAT_R8G8B8A8_UINT), + fmt1(VK_FORMAT_A8B8G8R8_SINT_PACK32, ISL_FORMAT_R8G8B8A8_SINT), + fmt1(VK_FORMAT_A8B8G8R8_SRGB_PACK32, ISL_FORMAT_R8G8B8A8_UNORM_SRGB), + fmt1(VK_FORMAT_A2R10G10B10_UNORM_PACK32, ISL_FORMAT_B10G10R10A2_UNORM), + fmt1(VK_FORMAT_A2R10G10B10_SNORM_PACK32, ISL_FORMAT_B10G10R10A2_SNORM), + fmt1(VK_FORMAT_A2R10G10B10_USCALED_PACK32, ISL_FORMAT_B10G10R10A2_USCALED), + fmt1(VK_FORMAT_A2R10G10B10_SSCALED_PACK32, ISL_FORMAT_B10G10R10A2_SSCALED), + fmt1(VK_FORMAT_A2R10G10B10_UINT_PACK32, 
ISL_FORMAT_B10G10R10A2_UINT), + fmt1(VK_FORMAT_A2R10G10B10_SINT_PACK32, ISL_FORMAT_B10G10R10A2_SINT), + fmt1(VK_FORMAT_A2B10G10R10_UNORM_PACK32, ISL_FORMAT_R10G10B10A2_UNORM), + fmt1(VK_FORMAT_A2B10G10R10_SNORM_PACK32, ISL_FORMAT_R10G10B10A2_SNORM), + fmt1(VK_FORMAT_A2B10G10R10_USCALED_PACK32, ISL_FORMAT_R10G10B10A2_USCALED), + fmt1(VK_FORMAT_A2B10G10R10_SSCALED_PACK32, ISL_FORMAT_R10G10B10A2_SSCALED), + fmt1(VK_FORMAT_A2B10G10R10_UINT_PACK32, ISL_FORMAT_R10G10B10A2_UINT), + fmt1(VK_FORMAT_A2B10G10R10_SINT_PACK32, ISL_FORMAT_R10G10B10A2_SINT), + fmt1(VK_FORMAT_R16_UNORM, ISL_FORMAT_R16_UNORM), + fmt1(VK_FORMAT_R16_SNORM, ISL_FORMAT_R16_SNORM), + fmt1(VK_FORMAT_R16_USCALED, ISL_FORMAT_R16_USCALED), + fmt1(VK_FORMAT_R16_SSCALED, ISL_FORMAT_R16_SSCALED), + fmt1(VK_FORMAT_R16_UINT, ISL_FORMAT_R16_UINT), + fmt1(VK_FORMAT_R16_SINT, ISL_FORMAT_R16_SINT), + fmt1(VK_FORMAT_R16_SFLOAT, ISL_FORMAT_R16_FLOAT), + fmt1(VK_FORMAT_R16G16_UNORM, ISL_FORMAT_R16G16_UNORM), + fmt1(VK_FORMAT_R16G16_SNORM, ISL_FORMAT_R16G16_SNORM), + fmt1(VK_FORMAT_R16G16_USCALED, ISL_FORMAT_R16G16_USCALED), + fmt1(VK_FORMAT_R16G16_SSCALED, ISL_FORMAT_R16G16_SSCALED), + fmt1(VK_FORMAT_R16G16_UINT, ISL_FORMAT_R16G16_UINT), + fmt1(VK_FORMAT_R16G16_SINT, ISL_FORMAT_R16G16_SINT), + fmt1(VK_FORMAT_R16G16_SFLOAT, ISL_FORMAT_R16G16_FLOAT), + fmt1(VK_FORMAT_R16G16B16_UNORM, ISL_FORMAT_R16G16B16_UNORM), + fmt1(VK_FORMAT_R16G16B16_SNORM, ISL_FORMAT_R16G16B16_SNORM), + fmt1(VK_FORMAT_R16G16B16_USCALED, ISL_FORMAT_R16G16B16_USCALED), + fmt1(VK_FORMAT_R16G16B16_SSCALED, ISL_FORMAT_R16G16B16_SSCALED), + fmt1(VK_FORMAT_R16G16B16_UINT, ISL_FORMAT_R16G16B16_UINT), + fmt1(VK_FORMAT_R16G16B16_SINT, ISL_FORMAT_R16G16B16_SINT), + fmt1(VK_FORMAT_R16G16B16_SFLOAT, ISL_FORMAT_R16G16B16_FLOAT), + fmt1(VK_FORMAT_R16G16B16A16_UNORM, ISL_FORMAT_R16G16B16A16_UNORM), + fmt1(VK_FORMAT_R16G16B16A16_SNORM, ISL_FORMAT_R16G16B16A16_SNORM), + fmt1(VK_FORMAT_R16G16B16A16_USCALED, ISL_FORMAT_R16G16B16A16_USCALED), + fmt1(VK_FORMAT_R16G16B16A16_SSCALED, ISL_FORMAT_R16G16B16A16_SSCALED), + fmt1(VK_FORMAT_R16G16B16A16_UINT, ISL_FORMAT_R16G16B16A16_UINT), + fmt1(VK_FORMAT_R16G16B16A16_SINT, ISL_FORMAT_R16G16B16A16_SINT), + fmt1(VK_FORMAT_R16G16B16A16_SFLOAT, ISL_FORMAT_R16G16B16A16_FLOAT), + fmt1(VK_FORMAT_R32_UINT, ISL_FORMAT_R32_UINT), + fmt1(VK_FORMAT_R32_SINT, ISL_FORMAT_R32_SINT), + fmt1(VK_FORMAT_R32_SFLOAT, ISL_FORMAT_R32_FLOAT), + fmt1(VK_FORMAT_R32G32_UINT, ISL_FORMAT_R32G32_UINT), + fmt1(VK_FORMAT_R32G32_SINT, ISL_FORMAT_R32G32_SINT), + fmt1(VK_FORMAT_R32G32_SFLOAT, ISL_FORMAT_R32G32_FLOAT), + fmt1(VK_FORMAT_R32G32B32_UINT, ISL_FORMAT_R32G32B32_UINT), + fmt1(VK_FORMAT_R32G32B32_SINT, ISL_FORMAT_R32G32B32_SINT), + fmt1(VK_FORMAT_R32G32B32_SFLOAT, ISL_FORMAT_R32G32B32_FLOAT), + fmt1(VK_FORMAT_R32G32B32A32_UINT, ISL_FORMAT_R32G32B32A32_UINT), + fmt1(VK_FORMAT_R32G32B32A32_SINT, ISL_FORMAT_R32G32B32A32_SINT), + fmt1(VK_FORMAT_R32G32B32A32_SFLOAT, ISL_FORMAT_R32G32B32A32_FLOAT), + fmt1(VK_FORMAT_R64_UINT, ISL_FORMAT_R64_PASSTHRU), + fmt1(VK_FORMAT_R64_SINT, ISL_FORMAT_R64_PASSTHRU), + fmt1(VK_FORMAT_R64_SFLOAT, ISL_FORMAT_R64_PASSTHRU), + fmt1(VK_FORMAT_R64G64_UINT, ISL_FORMAT_R64G64_PASSTHRU), + fmt1(VK_FORMAT_R64G64_SINT, ISL_FORMAT_R64G64_PASSTHRU), + fmt1(VK_FORMAT_R64G64_SFLOAT, ISL_FORMAT_R64G64_PASSTHRU), + fmt1(VK_FORMAT_R64G64B64_UINT, ISL_FORMAT_R64G64B64_PASSTHRU), + fmt1(VK_FORMAT_R64G64B64_SINT, ISL_FORMAT_R64G64B64_PASSTHRU), + fmt1(VK_FORMAT_R64G64B64_SFLOAT, ISL_FORMAT_R64G64B64_PASSTHRU), + fmt1(VK_FORMAT_R64G64B64A64_UINT, 
ISL_FORMAT_R64G64B64A64_PASSTHRU), + fmt1(VK_FORMAT_R64G64B64A64_SINT, ISL_FORMAT_R64G64B64A64_PASSTHRU), + fmt1(VK_FORMAT_R64G64B64A64_SFLOAT, ISL_FORMAT_R64G64B64A64_PASSTHRU), + fmt1(VK_FORMAT_B10G11R11_UFLOAT_PACK32, ISL_FORMAT_R11G11B10_FLOAT), + fmt1(VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, ISL_FORMAT_R9G9B9E5_SHAREDEXP), + + d_fmt(VK_FORMAT_D16_UNORM, ISL_FORMAT_R16_UNORM), + d_fmt(VK_FORMAT_X8_D24_UNORM_PACK32, ISL_FORMAT_R24_UNORM_X8_TYPELESS), + d_fmt(VK_FORMAT_D32_SFLOAT, ISL_FORMAT_R32_FLOAT), + s_fmt(VK_FORMAT_S8_UINT, ISL_FORMAT_R8_UINT), + fmt_unsupported(VK_FORMAT_D16_UNORM_S8_UINT), + ds_fmt2(VK_FORMAT_D24_UNORM_S8_UINT, ISL_FORMAT_R24_UNORM_X8_TYPELESS, ISL_FORMAT_R8_UINT), + ds_fmt2(VK_FORMAT_D32_SFLOAT_S8_UINT, ISL_FORMAT_R32_FLOAT, ISL_FORMAT_R8_UINT), + + swiz_fmt1(VK_FORMAT_BC1_RGB_UNORM_BLOCK, ISL_FORMAT_BC1_UNORM, RGB1), + swiz_fmt1(VK_FORMAT_BC1_RGB_SRGB_BLOCK, ISL_FORMAT_BC1_UNORM_SRGB, RGB1), + fmt1(VK_FORMAT_BC1_RGBA_UNORM_BLOCK, ISL_FORMAT_BC1_UNORM), + fmt1(VK_FORMAT_BC1_RGBA_SRGB_BLOCK, ISL_FORMAT_BC1_UNORM_SRGB), + fmt1(VK_FORMAT_BC2_UNORM_BLOCK, ISL_FORMAT_BC2_UNORM), + fmt1(VK_FORMAT_BC2_SRGB_BLOCK, ISL_FORMAT_BC2_UNORM_SRGB), + fmt1(VK_FORMAT_BC3_UNORM_BLOCK, ISL_FORMAT_BC3_UNORM), + fmt1(VK_FORMAT_BC3_SRGB_BLOCK, ISL_FORMAT_BC3_UNORM_SRGB), + fmt1(VK_FORMAT_BC4_UNORM_BLOCK, ISL_FORMAT_BC4_UNORM), + fmt1(VK_FORMAT_BC4_SNORM_BLOCK, ISL_FORMAT_BC4_SNORM), + fmt1(VK_FORMAT_BC5_UNORM_BLOCK, ISL_FORMAT_BC5_UNORM), + fmt1(VK_FORMAT_BC5_SNORM_BLOCK, ISL_FORMAT_BC5_SNORM), + fmt1(VK_FORMAT_BC6H_UFLOAT_BLOCK, ISL_FORMAT_BC6H_UF16), + fmt1(VK_FORMAT_BC6H_SFLOAT_BLOCK, ISL_FORMAT_BC6H_SF16), + fmt1(VK_FORMAT_BC7_UNORM_BLOCK, ISL_FORMAT_BC7_UNORM), + fmt1(VK_FORMAT_BC7_SRGB_BLOCK, ISL_FORMAT_BC7_UNORM_SRGB), + fmt1(VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK, ISL_FORMAT_ETC2_RGB8), + fmt1(VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK, ISL_FORMAT_ETC2_SRGB8), + fmt1(VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK, ISL_FORMAT_ETC2_RGB8_PTA), + fmt1(VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK, ISL_FORMAT_ETC2_SRGB8_PTA), + fmt1(VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK, ISL_FORMAT_ETC2_EAC_RGBA8), + fmt1(VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK, ISL_FORMAT_ETC2_EAC_SRGB8_A8), + fmt1(VK_FORMAT_EAC_R11_UNORM_BLOCK, ISL_FORMAT_EAC_R11), + fmt1(VK_FORMAT_EAC_R11_SNORM_BLOCK, ISL_FORMAT_EAC_SIGNED_R11), + fmt1(VK_FORMAT_EAC_R11G11_UNORM_BLOCK, ISL_FORMAT_EAC_RG11), + fmt1(VK_FORMAT_EAC_R11G11_SNORM_BLOCK, ISL_FORMAT_EAC_SIGNED_RG11), + fmt1(VK_FORMAT_ASTC_4x4_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_4X4_U8SRGB), + fmt1(VK_FORMAT_ASTC_5x4_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_5X4_U8SRGB), + fmt1(VK_FORMAT_ASTC_5x5_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_5X5_U8SRGB), + fmt1(VK_FORMAT_ASTC_6x5_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_6X5_U8SRGB), + fmt1(VK_FORMAT_ASTC_6x6_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_6X6_U8SRGB), + fmt1(VK_FORMAT_ASTC_8x5_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_8X5_U8SRGB), + fmt1(VK_FORMAT_ASTC_8x6_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_8X6_U8SRGB), + fmt1(VK_FORMAT_ASTC_8x8_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_8X8_U8SRGB), + fmt1(VK_FORMAT_ASTC_10x5_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X5_U8SRGB), + fmt1(VK_FORMAT_ASTC_10x6_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X6_U8SRGB), + fmt1(VK_FORMAT_ASTC_10x8_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X8_U8SRGB), + fmt1(VK_FORMAT_ASTC_10x10_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X10_U8SRGB), + fmt1(VK_FORMAT_ASTC_12x10_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_12X10_U8SRGB), + fmt1(VK_FORMAT_ASTC_12x12_SRGB_BLOCK, ISL_FORMAT_ASTC_LDR_2D_12X12_U8SRGB), + fmt1(VK_FORMAT_ASTC_4x4_UNORM_BLOCK, 
ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16), + fmt1(VK_FORMAT_ASTC_5x4_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_5X4_FLT16), + fmt1(VK_FORMAT_ASTC_5x5_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_5X5_FLT16), + fmt1(VK_FORMAT_ASTC_6x5_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_6X5_FLT16), + fmt1(VK_FORMAT_ASTC_6x6_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_6X6_FLT16), + fmt1(VK_FORMAT_ASTC_8x5_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_8X5_FLT16), + fmt1(VK_FORMAT_ASTC_8x6_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_8X6_FLT16), + fmt1(VK_FORMAT_ASTC_8x8_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_8X8_FLT16), + fmt1(VK_FORMAT_ASTC_10x5_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X5_FLT16), + fmt1(VK_FORMAT_ASTC_10x6_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X6_FLT16), + fmt1(VK_FORMAT_ASTC_10x8_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X8_FLT16), + fmt1(VK_FORMAT_ASTC_10x10_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_10X10_FLT16), + fmt1(VK_FORMAT_ASTC_12x10_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_12X10_FLT16), + fmt1(VK_FORMAT_ASTC_12x12_UNORM_BLOCK, ISL_FORMAT_ASTC_LDR_2D_12X12_FLT16), + fmt_unsupported(VK_FORMAT_B8G8R8_UNORM), + fmt_unsupported(VK_FORMAT_B8G8R8_SNORM), + fmt_unsupported(VK_FORMAT_B8G8R8_USCALED), + fmt_unsupported(VK_FORMAT_B8G8R8_SSCALED), + fmt_unsupported(VK_FORMAT_B8G8R8_UINT), + fmt_unsupported(VK_FORMAT_B8G8R8_SINT), + fmt_unsupported(VK_FORMAT_B8G8R8_SRGB), + fmt1(VK_FORMAT_B8G8R8A8_UNORM, ISL_FORMAT_B8G8R8A8_UNORM), + fmt_unsupported(VK_FORMAT_B8G8R8A8_SNORM), + fmt_unsupported(VK_FORMAT_B8G8R8A8_USCALED), + fmt_unsupported(VK_FORMAT_B8G8R8A8_SSCALED), + fmt_unsupported(VK_FORMAT_B8G8R8A8_UINT), + fmt_unsupported(VK_FORMAT_B8G8R8A8_SINT), + fmt1(VK_FORMAT_B8G8R8A8_SRGB, ISL_FORMAT_B8G8R8A8_UNORM_SRGB), +}; + +static const struct anv_format _4444_formats[] = { + fmt1(VK_FORMAT_A4R4G4B4_UNORM_PACK16, ISL_FORMAT_B4G4R4A4_UNORM), + fmt_unsupported(VK_FORMAT_A4B4G4R4_UNORM_PACK16), +}; + +static const struct anv_format ycbcr_formats[] = { + ycbcr_fmt(VK_FORMAT_G8B8G8R8_422_UNORM, 1, + y_plane(0, ISL_FORMAT_YCRCB_SWAPUV, RGBA, _ISL_SWIZZLE(BLUE, GREEN, RED, ZERO), 1, 1)), + ycbcr_fmt(VK_FORMAT_B8G8R8G8_422_UNORM, 1, + y_plane(0, ISL_FORMAT_YCRCB_SWAPUVY, RGBA, _ISL_SWIZZLE(BLUE, GREEN, RED, ZERO), 1, 1)), + ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, 3, + y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 2), + chroma_plane(2, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 2)), + ycbcr_fmt(VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, 2, + y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R8G8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 2)), + ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, 3, + y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 1), + chroma_plane(2, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 1)), + ycbcr_fmt(VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, 2, + y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R8G8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 1)), + ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, 3, + y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(2, ISL_FORMAT_R8_UNORM, RGBA, 
_ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 1, 1)), + + fmt_unsupported(VK_FORMAT_R10X6_UNORM_PACK16), + fmt_unsupported(VK_FORMAT_R10X6G10X6_UNORM_2PACK16), + fmt_unsupported(VK_FORMAT_R10X6G10X6B10X6A10X6_UNORM_4PACK16), + fmt_unsupported(VK_FORMAT_G10X6B10X6G10X6R10X6_422_UNORM_4PACK16), + fmt_unsupported(VK_FORMAT_B10X6G10X6R10X6G10X6_422_UNORM_4PACK16), + fmt_unsupported(VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_R12X4_UNORM_PACK16), + fmt_unsupported(VK_FORMAT_R12X4G12X4_UNORM_2PACK16), + fmt_unsupported(VK_FORMAT_R12X4G12X4B12X4A12X4_UNORM_4PACK16), + fmt_unsupported(VK_FORMAT_G12X4B12X4G12X4R12X4_422_UNORM_4PACK16), + fmt_unsupported(VK_FORMAT_B12X4G12X4R12X4G12X4_422_UNORM_4PACK16), + fmt_unsupported(VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_420_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_422_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G12X4_B12X4R12X4_2PLANE_422_UNORM_3PACK16), + fmt_unsupported(VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_444_UNORM_3PACK16), + /* TODO: it is possible to enable the following 2 formats, but that + * requires further refactoring of how we handle multiplanar formats. + */ + fmt_unsupported(VK_FORMAT_G16B16G16R16_422_UNORM), + fmt_unsupported(VK_FORMAT_B16G16R16G16_422_UNORM), + + ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, 3, + y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 2), + chroma_plane(2, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 2)), + ycbcr_fmt(VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, 2, + y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 2)), + ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, 3, + y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 1), + chroma_plane(2, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 1)), + ycbcr_fmt(VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, 2, + y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 1)), + ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, 3, + y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(1, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 1, 1), + chroma_plane(2, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 1, 1)), +}; + +#undef _fmt +#undef swiz_fmt1 +#undef fmt1 +#undef fmt + +static const struct { + const struct anv_format *formats; + uint32_t n_formats; +} anv_formats[] = { + [0] = { .formats = main_formats, + .n_formats = ARRAY_SIZE(main_formats), }, + [_VK_EXT_4444_formats_number] = { .formats = _4444_formats, + .n_formats = ARRAY_SIZE(_4444_formats), }, + [_VK_KHR_sampler_ycbcr_conversion_number] = { .formats = ycbcr_formats, + .n_formats = ARRAY_SIZE(ycbcr_formats), }, 
+}; + +const struct anv_format * +anv_get_format(VkFormat vk_format) +{ + uint32_t enum_offset = VK_ENUM_OFFSET(vk_format); + uint32_t ext_number = VK_ENUM_EXTENSION(vk_format); + + if (ext_number >= ARRAY_SIZE(anv_formats) || + enum_offset >= anv_formats[ext_number].n_formats) + return NULL; + + const struct anv_format *format = + &anv_formats[ext_number].formats[enum_offset]; + if (format->planes[0].isl_format == ISL_FORMAT_UNSUPPORTED) + return NULL; + + return format; +} + +/** Return true if any format plane has non-power-of-two bits-per-block. */ +static bool +anv_format_has_npot_plane(const struct anv_format *anv_format) { + for (uint32_t i = 0; i < anv_format->n_planes; ++i) { + const struct isl_format_layout *isl_layout = + isl_format_get_layout(anv_format->planes[i].isl_format); + + if (!util_is_power_of_two_or_zero(isl_layout->bpb)) + return true; + } + + return false; +} + +/** + * Exactly one bit must be set in \a aspect. + * + * If tiling is VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, then return the + * requested anv_format_plane without checking for compatibility with modifiers. + * It is the caller's responsibility to verify that the the returned + * anv_format_plane is compatible with a particular modifier. (Observe that + * this function has no parameter for the DRM format modifier, and therefore + * _cannot_ check for compatibility). + */ +struct anv_format_plane +anv_get_format_plane(const struct intel_device_info *devinfo, + VkFormat vk_format, uint32_t plane, + VkImageTiling tiling) +{ + const struct anv_format *format = anv_get_format(vk_format); + const struct anv_format_plane unsupported = { + .isl_format = ISL_FORMAT_UNSUPPORTED, + }; + + if (format == NULL) + return unsupported; + + assert(plane < format->n_planes); + struct anv_format_plane plane_format = format->planes[plane]; + if (plane_format.isl_format == ISL_FORMAT_UNSUPPORTED) + return unsupported; + + if (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) + return plane_format; + + if (vk_format_is_depth_or_stencil(vk_format)) + return plane_format; + + const struct isl_format_layout *isl_layout = + isl_format_get_layout(plane_format.isl_format); + + /* On Ivy Bridge we don't even have enough 24 and 48-bit formats that we + * can reliably do texture upload with BLORP so just don't claim support + * for any of them. + */ + if (devinfo->verx10 == 70 && + (isl_layout->bpb == 24 || isl_layout->bpb == 48)) + return unsupported; + + if (tiling == VK_IMAGE_TILING_OPTIMAL && + !util_is_power_of_two_or_zero(isl_layout->bpb)) { + /* Tiled formats *must* be power-of-two because we need up upload + * them with the render pipeline. For 3-channel formats, we fix + * this by switching them over to RGBX or RGBA formats under the + * hood. + */ + enum isl_format rgbx = isl_format_rgb_to_rgbx(plane_format.isl_format); + if (rgbx != ISL_FORMAT_UNSUPPORTED && + isl_format_supports_rendering(devinfo, rgbx)) { + plane_format.isl_format = rgbx; + } else { + plane_format.isl_format = + isl_format_rgb_to_rgba(plane_format.isl_format); + plane_format.swizzle = ISL_SWIZZLE(RED, GREEN, BLUE, ONE); + } + } + + /* The B4G4R4A4 format isn't available prior to Broadwell so we have to fall + * back to a format with a more complex swizzle. 
+ */ + if (vk_format == VK_FORMAT_B4G4R4A4_UNORM_PACK16 && devinfo->ver < 8) { + plane_format.isl_format = ISL_FORMAT_B4G4R4A4_UNORM; + plane_format.swizzle = ISL_SWIZZLE(GREEN, RED, ALPHA, BLUE); + } + + return plane_format; +} + +struct anv_format_plane +anv_get_format_aspect(const struct intel_device_info *devinfo, + VkFormat vk_format, + VkImageAspectFlagBits aspect, VkImageTiling tiling) +{ + const uint32_t plane = + anv_aspect_to_plane(vk_format_aspects(vk_format), aspect); + return anv_get_format_plane(devinfo, vk_format, plane, tiling); +} + +// Format capabilities + +VkFormatFeatureFlags2 +anv_get_image_format_features2(const struct intel_device_info *devinfo, + VkFormat vk_format, + const struct anv_format *anv_format, + VkImageTiling vk_tiling, + const struct isl_drm_modifier_info *isl_mod_info) +{ + VkFormatFeatureFlags2 flags = 0; + + if (anv_format == NULL) + return 0; + + assert((isl_mod_info != NULL) == + (vk_tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)); + + const VkImageAspectFlags aspects = vk_format_aspects(vk_format); + + if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + if (vk_tiling == VK_IMAGE_TILING_LINEAR || + vk_tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) + return 0; + + flags |= VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT | + VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | + VK_FORMAT_FEATURE_2_BLIT_DST_BIT | + VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; + + if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + + if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && devinfo->ver >= 9) + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT; + + if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT; + + return flags; + } + + assert(aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + const struct anv_format_plane plane_format = + anv_get_format_plane(devinfo, vk_format, 0, vk_tiling); + + if (plane_format.isl_format == ISL_FORMAT_UNSUPPORTED) + return 0; + + struct anv_format_plane base_plane_format = plane_format; + if (vk_tiling != VK_IMAGE_TILING_LINEAR) { + base_plane_format = anv_get_format_plane(devinfo, vk_format, 0, + VK_IMAGE_TILING_LINEAR); + } + + enum isl_format base_isl_format = base_plane_format.isl_format; + + if (isl_format_supports_sampling(devinfo, plane_format.isl_format)) { + /* ASTC textures must be in Y-tiled memory, and we reject compressed + * formats with modifiers. We do however interpret ASTC textures with + * uncompressed formats during data transfers. + */ + if (vk_tiling != VK_IMAGE_TILING_OPTIMAL && + isl_format_get_layout(plane_format.isl_format)->txc == ISL_TXC_ASTC) + return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; + + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT; + + if (devinfo->ver >= 9) + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT; + + if (isl_format_supports_filtering(devinfo, plane_format.isl_format)) + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + } + + /* We can render to swizzled formats. However, if the alpha channel is + * moved, then blending won't work correctly. The PRM tells us + * straight-up not to render to such a surface. 
+ */ + if (isl_format_supports_rendering(devinfo, plane_format.isl_format) && + plane_format.swizzle.a == ISL_CHANNEL_SELECT_ALPHA) { + flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT; + + /* While we can render to swizzled formats, they don't blend correctly + * if there are blend constants involved. The swizzle just remaps the + * output of the shader to different channels in the texture. It + * doesn't change the interpretation of the constant blend factors in + * COLOR_CALC_STATE. + */ + if (isl_format_supports_alpha_blending(devinfo, plane_format.isl_format) && + isl_swizzle_is_identity(plane_format.swizzle)) + flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT; + } + + /* Load/store is determined based on base format. This prevents RGB + * formats from showing up as load/store capable. + */ + if (isl_format_supports_typed_reads(devinfo, base_isl_format)) + flags |= VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT; + if (isl_format_supports_typed_writes(devinfo, base_isl_format)) + flags |= VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT; + + /* Keep this old behavior on VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT. + * When KHR_format_features2 is enabled, applications should only rely on + * it for the list of shader storage extended formats [1]. Before that, + * this applies to all VkFormats. + * + * [1] : https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#features-shaderStorageImageExtendedFormats + */ + if (flags & VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT) + flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT; + + if (base_isl_format == ISL_FORMAT_R32_SINT || + base_isl_format == ISL_FORMAT_R32_UINT || + base_isl_format == ISL_FORMAT_R32_FLOAT) + flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT; + + if (flags) { + flags |= VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | + VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; + + /* Blit destination requires rendering support. */ + if (isl_format_supports_rendering(devinfo, plane_format.isl_format)) + flags |= VK_FORMAT_FEATURE_2_BLIT_DST_BIT; + } + + /* XXX: We handle 3-channel formats by switching them out for RGBX or + * RGBA formats behind-the-scenes. This works fine for textures + * because the upload process will fill in the extra channel. + * We could also support it for render targets, but it will take + * substantially more work and we have enough RGBX formats to handle + * what most clients will want. + */ + if (vk_tiling == VK_IMAGE_TILING_OPTIMAL && + base_isl_format != ISL_FORMAT_UNSUPPORTED && + !util_is_power_of_two_or_zero(isl_format_layouts[base_isl_format].bpb) && + isl_format_rgb_to_rgbx(base_isl_format) == ISL_FORMAT_UNSUPPORTED) { + flags &= ~VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT; + flags &= ~VK_FORMAT_FEATURE_2_BLIT_DST_BIT; + } + + if (anv_format->can_ycbcr) { + /* The sampler doesn't have support for mid point when it handles YUV on + * its own. + */ + if (isl_format_is_yuv(anv_format->planes[0].isl_format)) { + /* TODO: We've disabled linear implicit reconstruction with the + * sampler. The failures show a slightly out of range values on the + * bottom left of the sampled image. 
+ */ + flags |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT; + } else { + flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT | + VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT | + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT; + } + + /* We can support cosited chroma locations when handle planes with our + * own shader snippets. + */ + for (unsigned p = 0; p < anv_format->n_planes; p++) { + if (anv_format->planes[p].denominator_scales[0] > 1 || + anv_format->planes[p].denominator_scales[1] > 1) { + flags |= VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT; + break; + } + } + + if (anv_format->n_planes > 1) + flags |= VK_FORMAT_FEATURE_2_DISJOINT_BIT; + + const VkFormatFeatureFlags2 disallowed_ycbcr_image_features = + VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | + VK_FORMAT_FEATURE_2_BLIT_DST_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT | + VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT; + + flags &= ~disallowed_ycbcr_image_features; + } + + if (vk_tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + if (!isl_drm_modifier_get_score(devinfo, isl_mod_info->modifier)) + return 0; + + /* Try to restrict the supported formats to those in drm_fourcc.h. The + * VK_EXT_image_drm_format_modifier does not require this (after all, two + * Vulkan apps could share an image by exchanging its VkFormat instead of + * a DRM_FORMAT), but there exist no users of such non-drm_fourcc formats + * yet. And the restriction shrinks our test surface. + */ + const struct isl_format_layout *isl_layout = + isl_format_get_layout(plane_format.isl_format); + + switch (isl_layout->colorspace) { + case ISL_COLORSPACE_LINEAR: + case ISL_COLORSPACE_SRGB: + /* Each DRM_FORMAT that we support uses unorm (if the DRM format name + * has no type suffix) or sfloat (if it has suffix F). No format + * contains mixed types. (as of 2021-06-14) + */ + if (isl_layout->uniform_channel_type != ISL_UNORM && + isl_layout->uniform_channel_type != ISL_SFLOAT) + return 0; + break; + case ISL_COLORSPACE_YUV: + anv_finishme("support YUV colorspace with DRM format modifiers"); + return 0; + case ISL_COLORSPACE_NONE: + return 0; + } + + /* We could support compressed formats if we wanted to. */ + if (isl_format_is_compressed(plane_format.isl_format)) + return 0; + + /* No non-power-of-two fourcc formats exist. + * + * Even if non-power-of-two fourcc formats existed, we could support them + * only with DRM_FORMAT_MOD_LINEAR. Tiled formats must be power-of-two + * because we implement transfers with the render pipeline. + */ + if (anv_format_has_npot_plane(anv_format)) + return 0; + + if (anv_format->n_planes > 1) { + /* For simplicity, keep DISJOINT disabled for multi-planar format. */ + flags &= ~VK_FORMAT_FEATURE_2_DISJOINT_BIT; + + /* VK_ANDROID_external_memory_android_hardware_buffer in Virtio-GPU + * Venus driver layers on top of VK_EXT_image_drm_format_modifier of + * the host Vulkan driver, and both VK_FORMAT_G8_B8R8_2PLANE_420_UNORM + * and VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM and required to support + * camera/media interop in Android. + */ + if (vk_format != VK_FORMAT_G8_B8R8_2PLANE_420_UNORM && + vk_format != VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) { + anv_finishme("support more multi-planar formats with DRM modifiers"); + return 0; + } + + /* Currently there is no way to properly map memory planes to format + * planes and aux planes due to the lack of defined ABI for external + * multi-planar images. 
+ */ + if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) { + return 0; + } + } + + if (isl_mod_info->aux_usage == ISL_AUX_USAGE_CCS_E && + !isl_format_supports_ccs_e(devinfo, plane_format.isl_format)) { + return 0; + } + + if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) { + /* Rejection DISJOINT for consistency with the GL driver. In + * eglCreateImage, we require that the dma_buf for the primary surface + * and the dma_buf for its aux surface refer to the same bo. + */ + flags &= ~VK_FORMAT_FEATURE_2_DISJOINT_BIT; + + /* When the hardware accesses a storage image, it bypasses the aux + * surface. We could support storage access on images with aux + * modifiers by resolving the aux surface prior to the storage access. + */ + flags &= ~VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT; + flags &= ~VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT; + } + } + + if (devinfo->has_coarse_pixel_primitive_and_cb && + vk_format == VK_FORMAT_R8_UINT && + vk_tiling == VK_IMAGE_TILING_OPTIMAL) + flags |= VK_FORMAT_FEATURE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR; + + return flags; +} + +static VkFormatFeatureFlags2 +get_buffer_format_features2(const struct intel_device_info *devinfo, + VkFormat vk_format, + const struct anv_format *anv_format) +{ + VkFormatFeatureFlags2 flags = 0; + + if (anv_format == NULL) + return 0; + + const enum isl_format isl_format = anv_format->planes[0].isl_format; + + if (isl_format == ISL_FORMAT_UNSUPPORTED) + return 0; + + if (anv_format->n_planes > 1) + return 0; + + if (anv_format->can_ycbcr) + return 0; + + if (vk_format_is_depth_or_stencil(vk_format)) + return 0; + + if (isl_format_supports_sampling(devinfo, isl_format) && + !isl_format_is_compressed(isl_format)) + flags |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT; + + if (isl_format_supports_vertex_fetch(devinfo, isl_format)) + flags |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT; + + if (isl_is_storage_image_format(isl_format)) + flags |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT; + + if (isl_format == ISL_FORMAT_R32_SINT || isl_format == ISL_FORMAT_R32_UINT) + flags |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; + + return flags; +} + +static VkFormatFeatureFlags +features2_to_features(VkFormatFeatureFlags2 features2) +{ + return features2 & VK_ALL_FORMAT_FEATURE_FLAG_BITS; +} + +static void +get_drm_format_modifier_properties_list(const struct anv_physical_device *physical_device, + VkFormat vk_format, + VkDrmFormatModifierPropertiesListEXT *list) +{ + const struct intel_device_info *devinfo = &physical_device->info; + const struct anv_format *anv_format = anv_get_format(vk_format); + + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierPropertiesEXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + + isl_drm_modifier_info_for_each(isl_mod_info) { + VkFormatFeatureFlags2 features2 = + anv_get_image_format_features2(devinfo, vk_format, anv_format, + VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, + isl_mod_info); + VkFormatFeatureFlags features = features2_to_features(features2); + if (!features) + continue; + + uint32_t planes = anv_format->n_planes; + if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) + ++planes; + + vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, &out, out_props) { + *out_props = (VkDrmFormatModifierPropertiesEXT) { + .drmFormatModifier = isl_mod_info->modifier, + .drmFormatModifierPlaneCount = planes, + .drmFormatModifierTilingFeatures = features, + }; + }; + } +} + +static void +get_drm_format_modifier_properties_list_2(const struct anv_physical_device *physical_device, + 
VkFormat vk_format, + VkDrmFormatModifierPropertiesList2EXT *list) +{ + const struct intel_device_info *devinfo = &physical_device->info; + const struct anv_format *anv_format = anv_get_format(vk_format); + + VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierProperties2EXT, out, + list->pDrmFormatModifierProperties, + &list->drmFormatModifierCount); + + isl_drm_modifier_info_for_each(isl_mod_info) { + VkFormatFeatureFlags2 features2 = + anv_get_image_format_features2(devinfo, vk_format, anv_format, + VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, + isl_mod_info); + if (!features2) + continue; + + uint32_t planes = anv_format->n_planes; + if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) + ++planes; + + vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT, &out, out_props) { + *out_props = (VkDrmFormatModifierProperties2EXT) { + .drmFormatModifier = isl_mod_info->modifier, + .drmFormatModifierPlaneCount = planes, + .drmFormatModifierTilingFeatures = features2, + }; + }; + } +} + +void anv_GetPhysicalDeviceFormatProperties2( + VkPhysicalDevice physicalDevice, + VkFormat vk_format, + VkFormatProperties2* pFormatProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + const struct intel_device_info *devinfo = &physical_device->info; + const struct anv_format *anv_format = anv_get_format(vk_format); + + assert(pFormatProperties->sType == VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2); + + VkFormatFeatureFlags2 linear2, optimal2, buffer2; + linear2 = anv_get_image_format_features2(devinfo, vk_format, anv_format, + VK_IMAGE_TILING_LINEAR, NULL); + optimal2 = anv_get_image_format_features2(devinfo, vk_format, anv_format, + VK_IMAGE_TILING_OPTIMAL, NULL); + buffer2 = get_buffer_format_features2(devinfo, vk_format, anv_format); + + pFormatProperties->formatProperties = (VkFormatProperties) { + .linearTilingFeatures = features2_to_features(linear2), + .optimalTilingFeatures = features2_to_features(optimal2), + .bufferFeatures = features2_to_features(buffer2), + }; + + vk_foreach_struct(ext, pFormatProperties->pNext) { + /* Use unsigned since some cases are not in the VkStructureType enum. 
*/ + switch ((unsigned)ext->sType) { + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT: + get_drm_format_modifier_properties_list(physical_device, vk_format, + (void *)ext); + break; + + case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT: + get_drm_format_modifier_properties_list_2(physical_device, vk_format, + (void *)ext); + break; + + case VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3: { + VkFormatProperties3 *props = (VkFormatProperties3 *)ext; + props->linearTilingFeatures = linear2; + props->optimalTilingFeatures = optimal2; + props->bufferFeatures = buffer2; + break; + } + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } +} + +static VkResult +anv_get_image_format_properties( + struct anv_physical_device *physical_device, + const VkPhysicalDeviceImageFormatInfo2 *info, + VkImageFormatProperties *pImageFormatProperties, + VkSamplerYcbcrConversionImageFormatProperties *pYcbcrImageFormatProperties) +{ + VkFormatFeatureFlags2 format_feature_flags; + VkExtent3D maxExtent; + uint32_t maxMipLevels; + uint32_t maxArraySize; + VkSampleCountFlags sampleCounts; + const struct intel_device_info *devinfo = &physical_device->info; + const struct anv_format *format = anv_get_format(info->format); + const struct isl_drm_modifier_info *isl_mod_info = NULL; + const VkImageFormatListCreateInfo *format_list_info = + vk_find_struct_const(info->pNext, IMAGE_FORMAT_LIST_CREATE_INFO); + + if (format == NULL) + goto unsupported; + + if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *vk_mod_info = + vk_find_struct_const(info->pNext, PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT); + + isl_mod_info = isl_drm_modifier_get_info(vk_mod_info->drmFormatModifier); + if (isl_mod_info == NULL) + goto unsupported; + } + + assert(format->vk_format == info->format); + format_feature_flags = anv_get_image_format_features2(devinfo, info->format, + format, info->tiling, + isl_mod_info); + + /* Remove the VkFormatFeatureFlags that are incompatible with any declared + * image view format. (Removals are more likely to occur when a DRM format + * modifier is present). + */ + if ((info->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) && format_list_info) { + for (uint32_t i = 0; i < format_list_info->viewFormatCount; ++i) { + VkFormat vk_view_format = format_list_info->pViewFormats[i]; + const struct anv_format *anv_view_format = anv_get_format(vk_view_format); + VkFormatFeatureFlags2 view_format_features = + anv_get_image_format_features2(devinfo, vk_view_format, + anv_view_format, + info->tiling, + isl_mod_info); + format_feature_flags &= view_format_features; + } + } + + if (!format_feature_flags) + goto unsupported; + + switch (info->type) { + default: + unreachable("bad VkImageType"); + case VK_IMAGE_TYPE_1D: + maxExtent.width = 16384; + maxExtent.height = 1; + maxExtent.depth = 1; + maxMipLevels = 15; /* log2(maxWidth) + 1 */ + maxArraySize = 2048; + sampleCounts = VK_SAMPLE_COUNT_1_BIT; + break; + case VK_IMAGE_TYPE_2D: + /* FINISHME: Does this really differ for cube maps? The documentation + * for RENDER_SURFACE_STATE suggests so. 
+ */ + maxExtent.width = 16384; + maxExtent.height = 16384; + maxExtent.depth = 1; + maxMipLevels = 15; /* log2(maxWidth) + 1 */ + maxArraySize = 2048; + sampleCounts = VK_SAMPLE_COUNT_1_BIT; + break; + case VK_IMAGE_TYPE_3D: + maxExtent.width = 2048; + maxExtent.height = 2048; + maxExtent.depth = 2048; + /* Prior to SKL, the mipmaps for 3D surfaces are laid out in a way + * that make it impossible to represent in the way that + * VkSubresourceLayout expects. Since we can't tell users how to make + * sense of them, don't report them as available. + */ + if (devinfo->ver < 9 && info->tiling == VK_IMAGE_TILING_LINEAR) + maxMipLevels = 1; + else + maxMipLevels = 12; /* log2(maxWidth) + 1 */ + maxArraySize = 1; + sampleCounts = VK_SAMPLE_COUNT_1_BIT; + break; + } + + /* From the Vulkan 1.2.199 spec: + * + * "VK_IMAGE_CREATE_EXTENDED_USAGE_BIT specifies that the image can be + * created with usage flags that are not supported for the format the + * image is created with but are supported for at least one format a + * VkImageView created from the image can have." + * + * If VK_IMAGE_CREATE_EXTENDED_USAGE_BIT is set, views can be created with + * different usage than the image so we can't always filter on usage. + * There is one exception to this below for storage. + */ + const VkImageUsageFlags image_usage = info->usage; + VkImageUsageFlags view_usage = image_usage; + if (info->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT) + view_usage = 0; + + if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + /* We support modifiers only for "simple" (that is, non-array + * non-mipmapped single-sample) 2D images. + */ + if (info->type != VK_IMAGE_TYPE_2D) { + vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED, + "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT " + "requires VK_IMAGE_TYPE_2D"); + goto unsupported; + } + + maxArraySize = 1; + maxMipLevels = 1; + sampleCounts = VK_SAMPLE_COUNT_1_BIT; + + if (isl_mod_info->aux_usage == ISL_AUX_USAGE_CCS_E && + !anv_formats_ccs_e_compatible(devinfo, info->flags, info->format, + info->tiling, image_usage, + format_list_info)) { + goto unsupported; + } + } + + /* Our hardware doesn't support 1D compressed textures. + * From the SKL PRM, RENDER_SURFACE_STATE::SurfaceFormat: + * * This field cannot be a compressed (BC*, DXT*, FXT*, ETC*, EAC*) format + * if the Surface Type is SURFTYPE_1D. + * * This field cannot be ASTC format if the Surface Type is SURFTYPE_1D. 
+ */ + if (info->type == VK_IMAGE_TYPE_1D && + isl_format_is_compressed(format->planes[0].isl_format)) { + goto unsupported; + } + + if (info->tiling == VK_IMAGE_TILING_OPTIMAL && + info->type == VK_IMAGE_TYPE_2D && + (format_feature_flags & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) && + !(info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) && + !(image_usage & VK_IMAGE_USAGE_STORAGE_BIT) && + isl_format_supports_multisampling(devinfo, format->planes[0].isl_format)) { + sampleCounts = isl_device_get_sample_counts(&physical_device->isl_dev); + } + + if (view_usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) { + if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | + VK_FORMAT_FEATURE_2_BLIT_SRC_BIT))) { + goto unsupported; + } + } + + if (view_usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) { + if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT | + VK_FORMAT_FEATURE_2_BLIT_DST_BIT))) { + goto unsupported; + } + } + + if (view_usage & VK_IMAGE_USAGE_SAMPLED_BIT) { + if (!(format_feature_flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT)) { + goto unsupported; + } + } + + if (image_usage & VK_IMAGE_USAGE_STORAGE_BIT) { + /* Non-power-of-two formats can never be used as storage images. We + * only check plane 0 because there are no YCbCr formats with + * non-power-of-two planes. + */ + const struct isl_format_layout *isl_layout = + isl_format_get_layout(format->planes[0].isl_format); + if (!util_is_power_of_two_or_zero(isl_layout->bpb)) + goto unsupported; + } + + if (view_usage & VK_IMAGE_USAGE_STORAGE_BIT) { + if (!(format_feature_flags & VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT)) { + goto unsupported; + } + } + + if (view_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { + if (!(format_feature_flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT)) { + goto unsupported; + } + } + + if (view_usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) { + if (!(format_feature_flags & VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) { + goto unsupported; + } + } + + if (info->flags & VK_IMAGE_CREATE_DISJOINT_BIT) { + /* From the Vulkan 1.2.149 spec, VkImageCreateInfo: + * + * If format is a multi-planar format, and if imageCreateFormatFeatures + * (as defined in Image Creation Limits) does not contain + * VK_FORMAT_FEATURE_2_DISJOINT_BIT, then flags must not contain + * VK_IMAGE_CREATE_DISJOINT_BIT. + */ + if (format->n_planes > 1 && + !(format_feature_flags & VK_FORMAT_FEATURE_2_DISJOINT_BIT)) { + goto unsupported; + } + + /* From the Vulkan 1.2.149 spec, VkImageCreateInfo: + * + * If format is not a multi-planar format, and flags does not include + * VK_IMAGE_CREATE_ALIAS_BIT, flags must not contain + * VK_IMAGE_CREATE_DISJOINT_BIT. + */ + if (format->n_planes == 1 && + !(info->flags & VK_IMAGE_CREATE_ALIAS_BIT)) { + goto unsupported; + } + + if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT && + isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) { + /* Rejection DISJOINT for consistency with the GL driver. In + * eglCreateImage, we require that the dma_buf for the primary surface + * and the dma_buf for its aux surface refer to the same bo. + */ + goto unsupported; + } + } + + if (info->flags & VK_IMAGE_CREATE_ALIAS_BIT) { + /* Reject aliasing of images with non-linear DRM format modifiers because: + * + * 1. For modifiers with compression, we store aux tracking state in + * ANV_IMAGE_MEMORY_BINDING_PRIVATE, which is not aliasable because it's + * not client-bound. + * + * 2. 
For tiled modifiers without compression, we may attempt to compress + * them behind the scenes, in which case both the aux tracking state + * and the CCS data are bound to ANV_IMAGE_MEMORY_BINDING_PRIVATE. + */ + if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT && + isl_mod_info->modifier != DRM_FORMAT_MOD_LINEAR) { + goto unsupported; + } + } + + if (image_usage & VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT) { + /* Nothing to check. */ + } + + if (image_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) { + /* Ignore this flag because it was removed from the + * provisional_I_20150910 header. + */ + } + + /* From the bspec section entitled "Surface Layout and Tiling", + * pre-gfx9 has a 2 GB limitation of the size in bytes, + * gfx9 and gfx10 have a 256 GB limitation and gfx11+ + * has a 16 TB limitation. + */ + uint64_t maxResourceSize = 0; + if (devinfo->ver < 9) + maxResourceSize = (uint64_t) 1 << 31; + else if (devinfo->ver < 11) + maxResourceSize = (uint64_t) 1 << 38; + else + maxResourceSize = (uint64_t) 1 << 44; + + *pImageFormatProperties = (VkImageFormatProperties) { + .maxExtent = maxExtent, + .maxMipLevels = maxMipLevels, + .maxArrayLayers = maxArraySize, + .sampleCounts = sampleCounts, + + /* FINISHME: Accurately calculate + * VkImageFormatProperties::maxResourceSize. + */ + .maxResourceSize = maxResourceSize, + }; + + if (pYcbcrImageFormatProperties) { + pYcbcrImageFormatProperties->combinedImageSamplerDescriptorCount = + format->n_planes; + } + + return VK_SUCCESS; + +unsupported: + *pImageFormatProperties = (VkImageFormatProperties) { + .maxExtent = { 0, 0, 0 }, + .maxMipLevels = 0, + .maxArrayLayers = 0, + .sampleCounts = 0, + .maxResourceSize = 0, + }; + + return VK_ERROR_FORMAT_NOT_SUPPORTED; +} + +VkResult anv_GetPhysicalDeviceImageFormatProperties( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkImageType type, + VkImageTiling tiling, + VkImageUsageFlags usage, + VkImageCreateFlags createFlags, + VkImageFormatProperties* pImageFormatProperties) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + + const VkPhysicalDeviceImageFormatInfo2 info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .pNext = NULL, + .format = format, + .type = type, + .tiling = tiling, + .usage = usage, + .flags = createFlags, + }; + + return anv_get_image_format_properties(physical_device, &info, + pImageFormatProperties, NULL); +} + + +/* Supports opaque fd but not dma_buf. */ +static const VkExternalMemoryProperties opaque_fd_only_props = { + .externalMemoryFeatures = + VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, + .compatibleHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, +}; + +/* Supports opaque fd and dma_buf. 
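The maxResourceSize shifts above encode the limits quoted from the bspec: 1 << 31 is 2 GiB, 1 << 38 is 256 GiB and 1 << 44 is 16 TiB; on the gfx7/8 platforms this driver targets, only the first case is reachable. A standalone sanity check of the constants (illustration only, not part of the patch):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
   const uint64_t GiB = UINT64_C(1) << 30;
   const uint64_t TiB = UINT64_C(1) << 40;

   assert((UINT64_C(1) << 31) == 2 * GiB);    /* pre-gfx9 */
   assert((UINT64_C(1) << 38) == 256 * GiB);  /* gfx9 / gfx10 */
   assert((UINT64_C(1) << 44) == 16 * TiB);   /* gfx11+ */
   return 0;
}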
*/ +static const VkExternalMemoryProperties opaque_fd_dma_buf_props = { + .externalMemoryFeatures = + VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT | + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, + .compatibleHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT | + VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT, +}; + +static const VkExternalMemoryProperties userptr_props = { + .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = 0, + .compatibleHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT, +}; + +static const VkExternalMemoryProperties android_buffer_props = { + .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID, + .compatibleHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID, +}; + + +static const VkExternalMemoryProperties android_image_props = { + .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT | + VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT, + .exportFromImportedHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID, + .compatibleHandleTypes = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID, +}; + +VkResult anv_GetPhysicalDeviceImageFormatProperties2( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceImageFormatInfo2* base_info, + VkImageFormatProperties2* base_props) +{ + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL; + VkExternalImageFormatProperties *external_props = NULL; + VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL; + VkAndroidHardwareBufferUsageANDROID *android_usage = NULL; + VkResult result; + + /* Extract input structs */ + vk_foreach_struct_const(s, base_info->pNext) { + switch (s->sType) { + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO: + external_info = (const void *) s; + break; + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT: + case VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO: + /* anv_get_image_format_properties will handle these */ + break; + case VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO: + /* Ignore but don't warn */ + break; + default: + anv_debug_ignored_stype(s->sType); + break; + } + } + + /* Extract output structs */ + vk_foreach_struct(s, base_props->pNext) { + switch (s->sType) { + case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES: + external_props = (void *) s; + break; + case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES: + ycbcr_props = (void *) s; + break; + case VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_USAGE_ANDROID: + android_usage = (void *) s; + break; + default: + anv_debug_ignored_stype(s->sType); + break; + } + } + + result = anv_get_image_format_properties(physical_device, base_info, + &base_props->imageFormatProperties, ycbcr_props); + if (result != VK_SUCCESS) + goto fail; + + bool ahw_supported = + physical_device->vk.supported_extensions.ANDROID_external_memory_android_hardware_buffer; + + if (ahw_supported && android_usage) { + android_usage->androidHardwareBufferUsage = + 
anv_ahw_usage_from_vk_usage(base_info->flags, + base_info->usage); + + /* Limit maxArrayLayers to 1 for AHardwareBuffer based images for now. */ + base_props->imageFormatProperties.maxArrayLayers = 1; + } + + /* From the Vulkan 1.0.42 spec: + * + * If handleType is 0, vkGetPhysicalDeviceImageFormatProperties2 will + * behave as if VkPhysicalDeviceExternalImageFormatInfo was not + * present and VkExternalImageFormatProperties will be ignored. + */ + if (external_info && external_info->handleType != 0) { + /* Does there exist a method for app and driver to explicitly communicate + * to each other the image's memory layout? + */ + bool tiling_has_explicit_layout; + + switch (base_info->tiling) { + default: + unreachable("bad VkImageTiling"); + case VK_IMAGE_TILING_LINEAR: + /* The app can query the image's memory layout with + * vkGetImageSubresourceLayout. + */ + tiling_has_explicit_layout = true; + break; + case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT: + /* The app can provide the image's memory layout with + * VkImageDrmFormatModifierExplicitCreateInfoEXT; + * or the app can query it with vkGetImageSubresourceLayout. + */ + tiling_has_explicit_layout = true; + break; + case VK_IMAGE_TILING_OPTIMAL: + /* The app can neither query nor provide the image's memory layout. */ + tiling_has_explicit_layout = false; + break; + } + + /* Compatibility between tiling and external memory handles + * -------------------------------------------------------- + * When importing or exporting an image, there must exist a method that + * enables the app and driver to agree on the image's memory layout. If no + * method exists, then we reject image creation here. + * + * If the memory handle requires matching + * VkPhysicalDeviceIDProperties::driverUUID and ::deviceUUID, then the + * match-requirement guarantees that all users of the image agree on the + * image's memory layout. + * + * If the memory handle does not require matching + * VkPhysicalDeviceIDProperties::driverUUID nor ::deviceUUID, then we + * require that the app and driver be able to explicitly communicate to + * each other the image's memory layout. + * + * (For restrictions on driverUUID and deviceUUID, see the Vulkan 1.2.149 + * spec, Table 73 "External memory handle types"). + */ + switch (external_info->handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + if (external_props) { + if (tiling_has_explicit_layout) { + /* With an explicit memory layout, we don't care which type of fd + * the image belongs too. Both OPAQUE_FD and DMA_BUF are + * interchangeable here. + */ + external_props->externalMemoryProperties = opaque_fd_dma_buf_props; + } else { + /* With an implicit memory layout, we must rely on deviceUUID + * and driverUUID to determine the layout. Therefore DMA_BUF is + * incompatible here. + */ + external_props->externalMemoryProperties = opaque_fd_only_props; + } + } + break; + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + /* This memory handle has no restrictions on driverUUID nor deviceUUID, + * and therefore requires explicit memory layout. + */ + if (!tiling_has_explicit_layout) { + result = vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED, + "VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT " + "requires VK_IMAGE_TILING_LINEAR or " + "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT"); + goto fail; + } + + /* With an explicit memory layout, we don't care which type of fd + * the image belongs too. Both OPAQUE_FD and DMA_BUF are + * interchangeable here. 
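For reference, the tiling/handle-type compatibility rules above are what an application runs into when it asks whether a linear image can be exported as a dma-buf. An application-side sketch (illustrative; the function name and the format/usage choices are arbitrary, not from the patch):

#include <vulkan/vulkan.h>

VkResult
query_linear_dma_buf_export(VkPhysicalDevice pdev, VkFormat format,
                            VkExternalMemoryProperties *out_props)
{
   const VkPhysicalDeviceExternalImageFormatInfo ext_info = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO,
      .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
   };
   const VkPhysicalDeviceImageFormatInfo2 info = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
      .pNext = &ext_info,
      .format = format,
      .type = VK_IMAGE_TYPE_2D,
      .tiling = VK_IMAGE_TILING_LINEAR,   /* explicit layout, so dma-buf is allowed */
      .usage = VK_IMAGE_USAGE_SAMPLED_BIT,
   };
   VkExternalImageFormatProperties ext_props = {
      .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES,
   };
   VkImageFormatProperties2 props = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2,
      .pNext = &ext_props,
   };

   VkResult result =
      vkGetPhysicalDeviceImageFormatProperties2(pdev, &info, &props);
   if (result == VK_SUCCESS)
      *out_props = ext_props.externalMemoryProperties;
   return result;
}

With VK_IMAGE_TILING_OPTIMAL instead of LINEAR, the DMA_BUF case in the switch rejects the query, since there is then no way for app and driver to agree on the memory layout.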
+ */ + if (external_props) + external_props->externalMemoryProperties = opaque_fd_dma_buf_props; + break; + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT: + /* This memory handle has no restrictions on driverUUID nor deviceUUID, + * and therefore requires explicit memory layout. + */ + if (!tiling_has_explicit_layout) { + result = vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED, + "VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT " + "requires VK_IMAGE_TILING_LINEAR or " + "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT"); + goto fail; + } + + if (external_props) + external_props->externalMemoryProperties = userptr_props; + break; + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID: + /* This memory handle is magic. The Vulkan spec says it has no + * requirements regarding deviceUUID nor driverUUID, but Android still + * requires support for VK_IMAGE_TILING_OPTIMAL. Android systems + * communicate the image's memory layout through backdoor channels. + */ + if (ahw_supported && external_props) { + external_props->externalMemoryProperties = android_image_props; + break; + } + FALLTHROUGH; /* If ahw not supported */ + default: + /* From the Vulkan 1.0.42 spec: + * + * If handleType is not compatible with the [parameters] specified + * in VkPhysicalDeviceImageFormatInfo2, then + * vkGetPhysicalDeviceImageFormatProperties2 returns + * VK_ERROR_FORMAT_NOT_SUPPORTED. + */ + result = vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED, + "unsupported VkExternalMemoryTypeFlagBits 0x%x", + external_info->handleType); + goto fail; + } + } + + return VK_SUCCESS; + + fail: + if (result == VK_ERROR_FORMAT_NOT_SUPPORTED) { + /* From the Vulkan 1.0.42 spec: + * + * If the combination of parameters to + * vkGetPhysicalDeviceImageFormatProperties2 is not supported by + * the implementation for use in vkCreateImage, then all members of + * imageFormatProperties will be filled with zero. + */ + base_props->imageFormatProperties = (VkImageFormatProperties) {}; + } + + return result; +} + +void anv_GetPhysicalDeviceSparseImageFormatProperties( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkImageType type, + uint32_t samples, + VkImageUsageFlags usage, + VkImageTiling tiling, + uint32_t* pNumProperties, + VkSparseImageFormatProperties* pProperties) +{ + /* Sparse images are not yet supported. */ + *pNumProperties = 0; +} + +void anv_GetPhysicalDeviceSparseImageFormatProperties2( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSparseImageFormatInfo2* pFormatInfo, + uint32_t* pPropertyCount, + VkSparseImageFormatProperties2* pProperties) +{ + /* Sparse images are not yet supported. */ + *pPropertyCount = 0; +} + +void anv_GetPhysicalDeviceExternalBufferProperties( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalBufferInfo* pExternalBufferInfo, + VkExternalBufferProperties* pExternalBufferProperties) +{ + /* The Vulkan 1.0.42 spec says "handleType must be a valid + * VkExternalMemoryHandleTypeFlagBits value" in + * VkPhysicalDeviceExternalBufferInfo. This differs from + * VkPhysicalDeviceExternalImageFormatInfo, which surprisingly permits + * handleType == 0. + */ + assert(pExternalBufferInfo->handleType != 0); + + /* All of the current flags are for sparse which we don't support yet. + * Even when we do support it, doing sparse on external memory sounds + * sketchy. Also, just disallowing flags is the safe option. 
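The buffer-side query handled below follows the same pattern but is simpler, since no tiling is involved. An application-side sketch of exercising the switch below (illustrative; the usage flag is arbitrary):

#include <stdbool.h>
#include <vulkan/vulkan.h>

/* Returns true if an opaque-fd export of a transfer-dst buffer is supported. */
bool
can_export_opaque_fd_buffer(VkPhysicalDevice pdev)
{
   const VkPhysicalDeviceExternalBufferInfo info = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO,
      .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT,
      .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
   };
   VkExternalBufferProperties props = {
      .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES,
   };

   vkGetPhysicalDeviceExternalBufferProperties(pdev, &info, &props);

   return props.externalMemoryProperties.externalMemoryFeatures &
          VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT;
}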
+ */ + if (pExternalBufferInfo->flags) + goto unsupported; + + ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice); + + switch (pExternalBufferInfo->handleType) { + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: + pExternalBufferProperties->externalMemoryProperties = opaque_fd_dma_buf_props; + return; + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT: + pExternalBufferProperties->externalMemoryProperties = userptr_props; + return; + case VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID: + if (physical_device->vk.supported_extensions.ANDROID_external_memory_android_hardware_buffer) { + pExternalBufferProperties->externalMemoryProperties = android_buffer_props; + return; + } + FALLTHROUGH; /* If ahw not supported */ + default: + goto unsupported; + } + + unsupported: + /* From the Vulkan 1.1.113 spec: + * + * compatibleHandleTypes must include at least handleType. + */ + pExternalBufferProperties->externalMemoryProperties = + (VkExternalMemoryProperties) { + .compatibleHandleTypes = pExternalBufferInfo->handleType, + }; +} + +VkResult anv_CreateSamplerYcbcrConversion( + VkDevice _device, + const VkSamplerYcbcrConversionCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSamplerYcbcrConversion* pYcbcrConversion) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_ycbcr_conversion *conversion; + + /* Search for VkExternalFormatANDROID and resolve the format. */ + struct anv_format *ext_format = NULL; + const VkExternalFormatANDROID *ext_info = + vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_FORMAT_ANDROID); + + uint64_t format = ext_info ? ext_info->externalFormat : 0; + if (format) { + assert(pCreateInfo->format == VK_FORMAT_UNDEFINED); + ext_format = (struct anv_format *) (uintptr_t) format; + } + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO); + + conversion = vk_object_zalloc(&device->vk, pAllocator, sizeof(*conversion), + VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION); + if (!conversion) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + conversion->format = anv_get_format(pCreateInfo->format); + conversion->ycbcr_model = pCreateInfo->ycbcrModel; + conversion->ycbcr_range = pCreateInfo->ycbcrRange; + + /* The Vulkan 1.1.95 spec says "When creating an external format conversion, + * the value of components if ignored." + */ + if (!ext_format) { + conversion->mapping[0] = pCreateInfo->components.r; + conversion->mapping[1] = pCreateInfo->components.g; + conversion->mapping[2] = pCreateInfo->components.b; + conversion->mapping[3] = pCreateInfo->components.a; + } + + conversion->chroma_offsets[0] = pCreateInfo->xChromaOffset; + conversion->chroma_offsets[1] = pCreateInfo->yChromaOffset; + conversion->chroma_filter = pCreateInfo->chromaFilter; + + /* Setup external format. 
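The conversion object being populated here is what an application creates for multi-planar formats. A minimal application-side sketch for an NV12-style two-plane format (illustrative; assumes the device actually advertises this format with VK_FILTER_LINEAR chroma filtering):

#include <vulkan/vulkan.h>

VkResult
create_nv12_conversion(VkDevice device, VkSamplerYcbcrConversion *out)
{
   const VkSamplerYcbcrConversionCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO,
      .format = VK_FORMAT_G8_B8R8_2PLANE_420_UNORM,
      .ycbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709,
      .ycbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_NARROW,
      .components = {
         VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY,
         VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY,
      },
      .xChromaOffset = VK_CHROMA_LOCATION_COSITED_EVEN,
      .yChromaOffset = VK_CHROMA_LOCATION_COSITED_EVEN,
      .chromaFilter = VK_FILTER_LINEAR,
      .forceExplicitReconstruction = VK_FALSE,
   };

   return vkCreateSamplerYcbcrConversion(device, &info, NULL, out);
}

With a 2x2-subsampled chroma plane and cosited-even chroma offsets as above, the chroma_reconstruction flag computed just below ends up set.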
*/ + if (ext_format) + conversion->format = ext_format; + + bool has_chroma_subsampled = false; + for (uint32_t p = 0; p < conversion->format->n_planes; p++) { + if (conversion->format->planes[p].has_chroma && + (conversion->format->planes[p].denominator_scales[0] > 1 || + conversion->format->planes[p].denominator_scales[1] > 1)) + has_chroma_subsampled = true; + } + conversion->chroma_reconstruction = has_chroma_subsampled && + (conversion->chroma_offsets[0] == VK_CHROMA_LOCATION_COSITED_EVEN || + conversion->chroma_offsets[1] == VK_CHROMA_LOCATION_COSITED_EVEN); + + *pYcbcrConversion = anv_ycbcr_conversion_to_handle(conversion); + + return VK_SUCCESS; +} + +void anv_DestroySamplerYcbcrConversion( + VkDevice _device, + VkSamplerYcbcrConversion YcbcrConversion, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion, YcbcrConversion); + + if (!conversion) + return; + + vk_object_free(&device->vk, pAllocator, conversion); +} diff --git a/src/intel/vulkan_hasvk/anv_gem.c b/src/intel/vulkan_hasvk/anv_gem.c new file mode 100644 index 00000000000..d69ebe424ca --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_gem.c @@ -0,0 +1,405 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "anv_private.h" +#include "common/intel_defines.h" +#include "common/intel_gem.h" + +/** + * Wrapper around DRM_IOCTL_I915_GEM_CREATE. + * + * Return gem handle, or 0 on failure. Gem handles are never 0. + */ +uint32_t +anv_gem_create(struct anv_device *device, uint64_t size) +{ + struct drm_i915_gem_create gem_create = { + .size = size, + }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create); + if (ret != 0) { + /* FIXME: What do we do if this fails? 
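intel_ioctl() from common/intel_gem.h, used throughout this file, is essentially a restart-on-interrupt wrapper around ioctl(); a sketch of the idea, consistent with the open-coded loop in anv_gem_set_tiling() further down (the helper name here is made up):

#include <errno.h>
#include <sys/ioctl.h>

/* Retry the ioctl when the syscall is interrupted, so callers don't have to
 * handle EINTR/EAGAIN themselves.
 */
int
retry_ioctl(int fd, unsigned long request, void *arg)
{
   int ret;
   do {
      ret = ioctl(fd, request, arg);
   } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
   return ret;
}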
*/ + return 0; + } + + return gem_create.handle; +} + +void +anv_gem_close(struct anv_device *device, uint32_t gem_handle) +{ + struct drm_gem_close close = { + .handle = gem_handle, + }; + + intel_ioctl(device->fd, DRM_IOCTL_GEM_CLOSE, &close); +} + +uint32_t +anv_gem_create_regions(struct anv_device *device, uint64_t anv_bo_size, + uint32_t flags, uint32_t num_regions, + struct drm_i915_gem_memory_class_instance *regions) +{ + /* Check for invalid flags */ + assert((flags & ~I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS) == 0); + + struct drm_i915_gem_create_ext_memory_regions ext_regions = { + .base = { .name = I915_GEM_CREATE_EXT_MEMORY_REGIONS }, + .num_regions = num_regions, + .regions = (uintptr_t)regions, + }; + + struct drm_i915_gem_create_ext gem_create = { + .size = anv_bo_size, + .extensions = (uintptr_t) &ext_regions, + .flags = flags, + }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE_EXT, + &gem_create); + if (ret != 0) { + return 0; + } + + return gem_create.handle; +} + +/** + * Wrapper around DRM_IOCTL_I915_GEM_MMAP. Returns MAP_FAILED on error. + */ +static void* +anv_gem_mmap_offset(struct anv_device *device, uint32_t gem_handle, + uint64_t offset, uint64_t size, uint32_t flags) +{ + struct drm_i915_gem_mmap_offset gem_mmap = { + .handle = gem_handle, + .flags = device->info->has_local_mem ? I915_MMAP_OFFSET_FIXED : + (flags & I915_MMAP_WC) ? I915_MMAP_OFFSET_WC : I915_MMAP_OFFSET_WB, + }; + assert(offset == 0); + + /* Get the fake offset back */ + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &gem_mmap); + if (ret != 0) + return MAP_FAILED; + + /* And map it */ + void *map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + device->fd, gem_mmap.offset); + return map; +} + +static void* +anv_gem_mmap_legacy(struct anv_device *device, uint32_t gem_handle, + uint64_t offset, uint64_t size, uint32_t flags) +{ + assert(!device->info->has_local_mem); + + struct drm_i915_gem_mmap gem_mmap = { + .handle = gem_handle, + .offset = offset, + .size = size, + .flags = flags, + }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP, &gem_mmap); + if (ret != 0) + return MAP_FAILED; + + return (void *)(uintptr_t) gem_mmap.addr_ptr; +} + +/** + * Wrapper around DRM_IOCTL_I915_GEM_MMAP. Returns MAP_FAILED on error. + */ +void* +anv_gem_mmap(struct anv_device *device, uint32_t gem_handle, + uint64_t offset, uint64_t size, uint32_t flags) +{ + void *map; + if (device->physical->has_mmap_offset) + map = anv_gem_mmap_offset(device, gem_handle, offset, size, flags); + else + map = anv_gem_mmap_legacy(device, gem_handle, offset, size, flags); + + if (map != MAP_FAILED) + VG(VALGRIND_MALLOCLIKE_BLOCK(map, size, 0, 1)); + + return map; +} + +/* This is just a wrapper around munmap, but it also notifies valgrind that + * this map is no longer valid. Pair this with anv_gem_mmap(). 
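The VALGRIND_MALLOCLIKE_BLOCK / VALGRIND_FREELIKE_BLOCK pair makes memcheck treat a GEM mapping like a heap allocation, so leaks and use-after-unmap of BO maps get reported; the VG() macro used above compiles the annotations out when Valgrind support is not built in. A minimal standalone illustration of the pairing (not driver code; uses an anonymous mapping instead of a GEM BO):

#include <stddef.h>
#include <sys/mman.h>
#include <valgrind/valgrind.h>

int
main(void)
{
   size_t size = 4096;
   void *map = mmap(NULL, size, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   if (map == MAP_FAILED)
      return 1;

   /* Tell memcheck this region behaves like a malloc'd block... */
   VALGRIND_MALLOCLIKE_BLOCK(map, size, 0, 1);

   /* ...and that it goes away again before the unmap. */
   VALGRIND_FREELIKE_BLOCK(map, 0);
   munmap(map, size);
   return 0;
}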
+ */ +void +anv_gem_munmap(struct anv_device *device, void *p, uint64_t size) +{ + VG(VALGRIND_FREELIKE_BLOCK(p, 0)); + munmap(p, size); +} + +uint32_t +anv_gem_userptr(struct anv_device *device, void *mem, size_t size) +{ + struct drm_i915_gem_userptr userptr = { + .user_ptr = (__u64)((unsigned long) mem), + .user_size = size, + .flags = 0, + }; + + if (device->physical->has_userptr_probe) + userptr.flags |= I915_USERPTR_PROBE; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_USERPTR, &userptr); + if (ret == -1) + return 0; + + return userptr.handle; +} + +int +anv_gem_set_caching(struct anv_device *device, + uint32_t gem_handle, uint32_t caching) +{ + struct drm_i915_gem_caching gem_caching = { + .handle = gem_handle, + .caching = caching, + }; + + return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &gem_caching); +} + +/** + * On error, \a timeout_ns holds the remaining time. + */ +int +anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns) +{ + struct drm_i915_gem_wait wait = { + .bo_handle = gem_handle, + .timeout_ns = *timeout_ns, + .flags = 0, + }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_WAIT, &wait); + *timeout_ns = wait.timeout_ns; + + return ret; +} + +int +anv_gem_execbuffer(struct anv_device *device, + struct drm_i915_gem_execbuffer2 *execbuf) +{ + if (execbuf->flags & I915_EXEC_FENCE_OUT) + return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, execbuf); + else + return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf); +} + +/** Return -1 on error. */ +int +anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle) +{ + if (!device->info->has_tiling_uapi) + return -1; + + struct drm_i915_gem_get_tiling get_tiling = { + .handle = gem_handle, + }; + + /* FIXME: On discrete platforms we don't have DRM_IOCTL_I915_GEM_GET_TILING + * anymore, so we will need another way to get the tiling. Apparently this + * is only used in Android code, so we may need some other way to + * communicate the tiling mode. + */ + if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) { + assert(!"Failed to get BO tiling"); + return -1; + } + + return get_tiling.tiling_mode; +} + +int +anv_gem_set_tiling(struct anv_device *device, + uint32_t gem_handle, uint32_t stride, uint32_t tiling) +{ + int ret; + + /* On discrete platforms we don't have DRM_IOCTL_I915_GEM_SET_TILING. So + * nothing needs to be done. + */ + if (!device->info->has_tiling_uapi) + return 0; + + /* set_tiling overwrites the input on the error path, so we have to open + * code intel_ioctl. 
+ */ + do { + struct drm_i915_gem_set_tiling set_tiling = { + .handle = gem_handle, + .tiling_mode = tiling, + .stride = stride, + }; + + ret = ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling); + } while (ret == -1 && (errno == EINTR || errno == EAGAIN)); + + return ret; +} + +int +anv_gem_get_param(int fd, uint32_t param) +{ + int tmp; + + drm_i915_getparam_t gp = { + .param = param, + .value = &tmp, + }; + + int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp); + if (ret == 0) + return tmp; + + return 0; +} + +bool +anv_gem_has_context_priority(int fd, int priority) +{ + return !anv_gem_set_context_param(fd, 0, I915_CONTEXT_PARAM_PRIORITY, + priority); +} + +int +anv_gem_create_context(struct anv_device *device) +{ + struct drm_i915_gem_context_create create = { 0 }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create); + if (ret == -1) + return -1; + + return create.ctx_id; +} + +int +anv_gem_destroy_context(struct anv_device *device, int context) +{ + struct drm_i915_gem_context_destroy destroy = { + .ctx_id = context, + }; + + return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &destroy); +} + +int +anv_gem_set_context_param(int fd, int context, uint32_t param, uint64_t value) +{ + struct drm_i915_gem_context_param p = { + .ctx_id = context, + .param = param, + .value = value, + }; + int err = 0; + + if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p)) + err = -errno; + return err; +} + +int +anv_gem_context_get_reset_stats(int fd, int context, + uint32_t *active, uint32_t *pending) +{ + struct drm_i915_reset_stats stats = { + .ctx_id = context, + }; + + int ret = intel_ioctl(fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats); + if (ret == 0) { + *active = stats.batch_active; + *pending = stats.batch_pending; + } + + return ret; +} + +int +anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle) +{ + struct drm_prime_handle args = { + .handle = gem_handle, + .flags = DRM_CLOEXEC | DRM_RDWR, + }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args); + if (ret == -1) + return -1; + + return args.fd; +} + +uint32_t +anv_gem_fd_to_handle(struct anv_device *device, int fd) +{ + struct drm_prime_handle args = { + .fd = fd, + }; + + int ret = intel_ioctl(device->fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &args); + if (ret == -1) + return 0; + + return args.handle; +} + +int +anv_gem_reg_read(int fd, uint32_t offset, uint64_t *result) +{ + struct drm_i915_reg_read args = { + .offset = offset + }; + + int ret = intel_ioctl(fd, DRM_IOCTL_I915_REG_READ, &args); + + *result = args.val; + return ret; +} + +struct drm_i915_query_engine_info * +anv_gem_get_engine_info(int fd) +{ + return intel_i915_query_alloc(fd, DRM_I915_QUERY_ENGINE_INFO, NULL); +} diff --git a/src/intel/vulkan_hasvk/anv_gem_stubs.c b/src/intel/vulkan_hasvk/anv_gem_stubs.c new file mode 100644 index 00000000000..52767d6f3c0 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_gem_stubs.c @@ -0,0 +1,187 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this 
permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include + +#include "util/anon_file.h" +#include "anv_private.h" + +uint32_t +anv_gem_create(struct anv_device *device, uint64_t size) +{ + int fd = os_create_anonymous_file(size, "fake bo"); + if (fd == -1) + return 0; + + assert(fd != 0); + + return fd; +} + +void +anv_gem_close(struct anv_device *device, uint32_t gem_handle) +{ + close(gem_handle); +} + +uint32_t +anv_gem_create_regions(struct anv_device *device, uint64_t anv_bo_size, + uint32_t flags, uint32_t num_regions, + struct drm_i915_gem_memory_class_instance *regions) +{ + return 0; +} + +void* +anv_gem_mmap(struct anv_device *device, uint32_t gem_handle, + uint64_t offset, uint64_t size, uint32_t flags) +{ + /* Ignore flags, as they're specific to I915_GEM_MMAP. */ + (void) flags; + + return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + gem_handle, offset); +} + +/* This is just a wrapper around munmap, but it also notifies valgrind that + * this map is no longer valid. Pair this with anv_gem_mmap(). + */ +void +anv_gem_munmap(struct anv_device *device, void *p, uint64_t size) +{ + munmap(p, size); +} + +uint32_t +anv_gem_userptr(struct anv_device *device, void *mem, size_t size) +{ + int fd = os_create_anonymous_file(size, "fake bo"); + if (fd == -1) + return 0; + + assert(fd != 0); + + return fd; +} + +int +anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns) +{ + return 0; +} + +int +anv_gem_execbuffer(struct anv_device *device, + struct drm_i915_gem_execbuffer2 *execbuf) +{ + return 0; +} + +int +anv_gem_set_tiling(struct anv_device *device, + uint32_t gem_handle, uint32_t stride, uint32_t tiling) +{ + return 0; +} + +int +anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle) +{ + return 0; +} + +int +anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, + uint32_t caching) +{ + return 0; +} + +int +anv_gem_get_param(int fd, uint32_t param) +{ + unreachable("Unused"); +} + +int +anv_gem_create_context(struct anv_device *device) +{ + unreachable("Unused"); +} + +int +anv_gem_destroy_context(struct anv_device *device, int context) +{ + unreachable("Unused"); +} + +int +anv_gem_set_context_param(int fd, int context, uint32_t param, uint64_t value) +{ + unreachable("Unused"); +} + +bool +anv_gem_has_context_priority(int fd, int priority) +{ + unreachable("Unused"); +} + +int +anv_gem_context_get_reset_stats(int fd, int context, + uint32_t *active, uint32_t *pending) +{ + unreachable("Unused"); +} + +int +anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle) +{ + unreachable("Unused"); +} + +uint32_t +anv_gem_fd_to_handle(struct anv_device *device, int fd) +{ + unreachable("Unused"); +} + +int +anv_i915_query(int fd, uint64_t query_id, void *buffer, + int32_t *buffer_len) +{ + unreachable("Unused"); +} + +struct drm_i915_query_engine_info * +anv_gem_get_engine_info(int fd) +{ + unreachable("Unused"); +} + +int 
+anv_gem_reg_read(int fd, uint32_t offset, uint64_t *result) +{ + unreachable("Unused"); +} diff --git a/src/intel/vulkan_hasvk/anv_genX.h b/src/intel/vulkan_hasvk/anv_genX.h new file mode 100644 index 00000000000..102514d5e7d --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_genX.h @@ -0,0 +1,180 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* + * NOTE: The header can be included multiple times, from the same file. + */ + +/* + * Gen-specific function declarations. This header must *not* be included + * directly. Instead, it is included multiple times by anv_private.h. + * + * In this header file, the usual genx() macro is available. + */ + +#ifndef ANV_PRIVATE_H +#error This file is included by means other than anv_private.h +#endif + +struct intel_sample_positions; + +typedef struct VkRenderingSelfDependencyInfoMESA VkRenderingSelfDependencyInfoMESA; + +extern const uint32_t genX(vk_to_intel_cullmode)[]; + +extern const uint32_t genX(vk_to_intel_front_face)[]; + +extern const uint32_t genX(vk_to_intel_primitive_type)[]; + +extern const uint32_t genX(vk_to_intel_compare_op)[]; + +extern const uint32_t genX(vk_to_intel_stencil_op)[]; + +extern const uint32_t genX(vk_to_intel_logic_op)[]; + +void genX(init_physical_device_state)(struct anv_physical_device *device); + +VkResult genX(init_device_state)(struct anv_device *device); + +void genX(init_cps_device_state)(struct anv_device *device); + +void genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer); + +void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer); + +void genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer); + +void genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer, + const struct isl_surf *surf); + +void genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + int vb_index, + struct anv_address vb_address, + uint32_t vb_size); +void genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type, + uint64_t vb_used); + +void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, + unsigned width, unsigned height, + unsigned scale); + +void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer); +void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer); + +enum anv_pipe_bits +genX(emit_apply_pipe_flushes)(struct 
anv_batch *batch, + struct anv_device *device, + uint32_t current_pipeline, + enum anv_pipe_bits bits); + +void genX(emit_so_memcpy_init)(struct anv_memcpy_state *state, + struct anv_device *device, + struct anv_batch *batch); + +void genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state); + +void genX(emit_so_memcpy)(struct anv_memcpy_state *state, + struct anv_address dst, struct anv_address src, + uint32_t size); + +void genX(emit_l3_config)(struct anv_batch *batch, + const struct anv_device *device, + const struct intel_l3_config *cfg); + +void genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, + const struct intel_l3_config *cfg); + +void genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer); +void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer); + +void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer); + +void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, + bool enable); + +void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + uint32_t level, + uint32_t base_layer, + uint32_t layer_count); + +void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer); + +struct anv_state genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer); + +void +genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, + const struct intel_l3_config *l3_config, + VkShaderStageFlags active_stages, + const unsigned entry_size[4], + enum intel_urb_deref_block_size *deref_block_size); + +void genX(emit_multisample)(struct anv_batch *batch, uint32_t samples, + const struct vk_sample_locations_state *sl); + +void genX(emit_sample_pattern)(struct anv_batch *batch, + const struct vk_sample_locations_state *sl); + +void genX(emit_shading_rate)(struct anv_batch *batch, + const struct anv_graphics_pipeline *pipeline, + const struct vk_fragment_shading_rate_state *fsr); + +void genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, + struct anv_address dst, struct anv_address src, + uint32_t size); + +void genX(blorp_exec)(struct blorp_batch *batch, + const struct blorp_params *params); + +void genX(cmd_emit_timestamp)(struct anv_batch *batch, + struct anv_device *device, + struct anv_address addr, + bool end_of_pipe); + +void +genX(rasterization_mode)(VkPolygonMode raster_mode, + VkLineRasterizationModeEXT line_mode, + float line_width, + uint32_t *api_mode, + bool *msaa_rasterization_enable); + +uint32_t +genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline, + VkPolygonMode raster_mode); + +VkPolygonMode +genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline, + VkPrimitiveTopology primitive_topology); + +void +genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline, + const struct vk_graphics_pipeline_state *state); + +void +genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline); + +void +genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline); diff --git a/src/intel/vulkan_hasvk/anv_image.c b/src/intel/vulkan_hasvk/anv_image.c new file mode 100644 index 00000000000..6fb8b43c6de --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_image.c @@ -0,0 +1,2973 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without 
restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "drm-uapi/drm_fourcc.h" + +#include "anv_private.h" +#include "util/debug.h" +#include "vk_util.h" +#include "util/u_math.h" + +#include "vk_format.h" + +#define ANV_OFFSET_IMPLICIT UINT64_MAX + +static const enum isl_surf_dim +vk_to_isl_surf_dim[] = { + [VK_IMAGE_TYPE_1D] = ISL_SURF_DIM_1D, + [VK_IMAGE_TYPE_2D] = ISL_SURF_DIM_2D, + [VK_IMAGE_TYPE_3D] = ISL_SURF_DIM_3D, +}; + +static uint64_t MUST_CHECK UNUSED +memory_range_end(struct anv_image_memory_range memory_range) +{ + assert(anv_is_aligned(memory_range.offset, memory_range.alignment)); + return memory_range.offset + memory_range.size; +} + +/** + * Get binding for VkImagePlaneMemoryRequirementsInfo, + * VkBindImagePlaneMemoryInfo and VkDeviceImageMemoryRequirements. + */ +static struct anv_image_binding * +image_aspect_to_binding(struct anv_image *image, VkImageAspectFlags aspect) +{ + uint32_t plane; + + assert(image->disjoint); + + if (image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + /* Spec requires special aspects for modifier images. */ + assert(aspect >= VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT && + aspect <= VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT); + + /* We don't advertise DISJOINT for modifiers with aux, and therefore we + * don't handle queries of the modifier's "aux plane" here. + */ + assert(!isl_drm_modifier_has_aux(image->vk.drm_format_mod)); + + plane = aspect - VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT; + } else { + plane = anv_image_aspect_to_plane(image, aspect); + } + + return &image->bindings[ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane]; +} + +/** + * Extend the memory binding's range by appending a new memory range with `size` + * and `alignment` at `offset`. Return the appended range. + * + * Offset is ignored if ANV_OFFSET_IMPLICIT. + * + * The given binding must not be ANV_IMAGE_MEMORY_BINDING_MAIN. The function + * converts to MAIN as needed. + */ +static VkResult MUST_CHECK +image_binding_grow(const struct anv_device *device, + struct anv_image *image, + enum anv_image_memory_binding binding, + uint64_t offset, + uint64_t size, + uint32_t alignment, + struct anv_image_memory_range *out_range) +{ + /* We overwrite 'offset' but need to remember if it was implicit. */ + const bool has_implicit_offset = (offset == ANV_OFFSET_IMPLICIT); + + assert(size > 0); + assert(util_is_power_of_two_or_zero(alignment)); + + switch (binding) { + case ANV_IMAGE_MEMORY_BINDING_MAIN: + /* The caller must not pre-translate BINDING_PLANE_i to BINDING_MAIN. 
*/ + unreachable("ANV_IMAGE_MEMORY_BINDING_MAIN"); + case ANV_IMAGE_MEMORY_BINDING_PLANE_0: + case ANV_IMAGE_MEMORY_BINDING_PLANE_1: + case ANV_IMAGE_MEMORY_BINDING_PLANE_2: + if (!image->disjoint) + binding = ANV_IMAGE_MEMORY_BINDING_MAIN; + break; + case ANV_IMAGE_MEMORY_BINDING_PRIVATE: + assert(offset == ANV_OFFSET_IMPLICIT); + break; + case ANV_IMAGE_MEMORY_BINDING_END: + unreachable("ANV_IMAGE_MEMORY_BINDING_END"); + } + + struct anv_image_memory_range *container = + &image->bindings[binding].memory_range; + + if (has_implicit_offset) { + offset = align_u64(container->offset + container->size, alignment); + } else { + /* Offset must be validated because it comes from + * VkImageDrmFormatModifierExplicitCreateInfoEXT. + */ + if (unlikely(!anv_is_aligned(offset, alignment))) { + return vk_errorf(device, + VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT, + "VkImageDrmFormatModifierExplicitCreateInfoEXT::" + "pPlaneLayouts[]::offset is misaligned"); + } + + /* We require that surfaces be added in memory-order. This simplifies the + * layout validation required by + * VkImageDrmFormatModifierExplicitCreateInfoEXT, + */ + if (unlikely(offset < container->size)) { + return vk_errorf(device, + VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT, + "VkImageDrmFormatModifierExplicitCreateInfoEXT::" + "pPlaneLayouts[]::offset is too small"); + } + } + + if (__builtin_add_overflow(offset, size, &container->size)) { + if (has_implicit_offset) { + assert(!"overflow"); + return vk_errorf(device, VK_ERROR_UNKNOWN, + "internal error: overflow in %s", __func__); + } else { + return vk_errorf(device, + VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT, + "VkImageDrmFormatModifierExplicitCreateInfoEXT::" + "pPlaneLayouts[]::offset is too large"); + } + } + + container->alignment = MAX2(container->alignment, alignment); + + *out_range = (struct anv_image_memory_range) { + .binding = binding, + .offset = offset, + .size = size, + .alignment = alignment, + }; + + return VK_SUCCESS; +} + +/** + * Adjust range 'a' to contain range 'b'. + * + * For simplicity's sake, the offset of 'a' must be 0 and remains 0. + * If 'a' and 'b' target different bindings, then no merge occurs. 
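With an implicit offset, image_binding_grow() above appends the new range at the aligned end of what the binding already holds and bumps the container size to offset + size. A reduced standalone model of that append behavior (illustrative; the struct and helper names are ad hoc, not the driver's):

#include <assert.h>
#include <stdint.h>

struct range { uint64_t offset, size; uint32_t alignment; };

static uint64_t
align_up_u64(uint64_t v, uint64_t a)
{
   return (v + a - 1) & ~(a - 1);
}

/* Append a (size, alignment) sub-range at the aligned end of 'container',
 * grow the container to cover it, and return the placed sub-range.
 */
static struct range
append_range(struct range *container, uint64_t size, uint32_t alignment)
{
   uint64_t offset = align_up_u64(container->offset + container->size, alignment);
   container->size = offset + size;
   if (alignment > container->alignment)
      container->alignment = alignment;
   return (struct range) { .offset = offset, .size = size, .alignment = alignment };
}

int
main(void)
{
   struct range binding = { 0 };
   struct range main_surf = append_range(&binding, 1000, 64);   /* lands at 0 */
   struct range aux_surf  = append_range(&binding, 128, 4096);  /* lands at 4096 */

   assert(main_surf.offset == 0);
   assert(aux_surf.offset == 4096);
   assert(binding.size == 4096 + 128);
   return 0;
}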
+ */ +static void +memory_range_merge(struct anv_image_memory_range *a, + const struct anv_image_memory_range b) +{ + if (b.size == 0) + return; + + if (a->binding != b.binding) + return; + + assert(a->offset == 0); + assert(anv_is_aligned(a->offset, a->alignment)); + assert(anv_is_aligned(b.offset, b.alignment)); + + a->alignment = MAX2(a->alignment, b.alignment); + a->size = MAX2(a->size, b.offset + b.size); +} + +static isl_surf_usage_flags_t +choose_isl_surf_usage(VkImageCreateFlags vk_create_flags, + VkImageUsageFlags vk_usage, + isl_surf_usage_flags_t isl_extra_usage, + VkImageAspectFlagBits aspect) +{ + isl_surf_usage_flags_t isl_usage = isl_extra_usage; + + if (vk_usage & VK_IMAGE_USAGE_SAMPLED_BIT) + isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT; + + if (vk_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) + isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT; + + if (vk_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) + isl_usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT; + + if (vk_usage & VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR) + isl_usage |= ISL_SURF_USAGE_CPB_BIT; + + if (vk_create_flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) + isl_usage |= ISL_SURF_USAGE_CUBE_BIT; + + /* Even if we're only using it for transfer operations, clears to depth and + * stencil images happen as depth and stencil so they need the right ISL + * usage bits or else things will fall apart. + */ + switch (aspect) { + case VK_IMAGE_ASPECT_DEPTH_BIT: + isl_usage |= ISL_SURF_USAGE_DEPTH_BIT; + break; + case VK_IMAGE_ASPECT_STENCIL_BIT: + isl_usage |= ISL_SURF_USAGE_STENCIL_BIT; + break; + case VK_IMAGE_ASPECT_COLOR_BIT: + case VK_IMAGE_ASPECT_PLANE_0_BIT: + case VK_IMAGE_ASPECT_PLANE_1_BIT: + case VK_IMAGE_ASPECT_PLANE_2_BIT: + break; + default: + unreachable("bad VkImageAspect"); + } + + if (vk_usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) { + /* blorp implements transfers by sampling from the source image. */ + isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT; + } + + if (vk_usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT && + aspect == VK_IMAGE_ASPECT_COLOR_BIT) { + /* blorp implements transfers by rendering into the destination image. + * Only request this with color images, as we deal with depth/stencil + * formats differently. */ + isl_usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT; + } + + return isl_usage; +} + +static isl_tiling_flags_t +choose_isl_tiling_flags(const struct intel_device_info *devinfo, + const struct anv_image_create_info *anv_info, + const struct isl_drm_modifier_info *isl_mod_info, + bool legacy_scanout) +{ + const VkImageCreateInfo *base_info = anv_info->vk_info; + isl_tiling_flags_t flags = 0; + + assert((isl_mod_info != NULL) == + (base_info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)); + + switch (base_info->tiling) { + default: + unreachable("bad VkImageTiling"); + case VK_IMAGE_TILING_OPTIMAL: + flags = ISL_TILING_ANY_MASK; + break; + case VK_IMAGE_TILING_LINEAR: + flags = ISL_TILING_LINEAR_BIT; + break; + case VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT: + flags = 1 << isl_mod_info->tiling; + } + + if (anv_info->isl_tiling_flags) { + assert(isl_mod_info == NULL); + flags &= anv_info->isl_tiling_flags; + } + + if (legacy_scanout) { + isl_tiling_flags_t legacy_mask = ISL_TILING_LINEAR_BIT; + if (devinfo->has_tiling_uapi) + legacy_mask |= ISL_TILING_X_BIT; + flags &= legacy_mask; + } + + assert(flags); + + return flags; +} + +/** + * Add the surface to the binding at the given offset. 
+ * + * \see image_binding_grow() + */ +static VkResult MUST_CHECK +add_surface(struct anv_device *device, + struct anv_image *image, + struct anv_surface *surf, + enum anv_image_memory_binding binding, + uint64_t offset) +{ + /* isl surface must be initialized */ + assert(surf->isl.size_B > 0); + + return image_binding_grow(device, image, binding, offset, + surf->isl.size_B, + surf->isl.alignment_B, + &surf->memory_range); +} + +/** + * Do hardware limitations require the image plane to use a shadow surface? + * + * If hardware limitations force us to use a shadow surface, then the same + * limitations may also constrain the tiling of the primary surface; therefore + * parameter @a inout_primary_tiling_flags. + * + * If the image plane is a separate stencil plane and if the user provided + * VkImageStencilUsageCreateInfo, then @a usage must be stencilUsage. + * + * @see anv_image::planes[]::shadow_surface + */ +static bool +anv_image_plane_needs_shadow_surface(const struct intel_device_info *devinfo, + struct anv_format_plane plane_format, + VkImageTiling vk_tiling, + VkImageUsageFlags vk_plane_usage, + VkImageCreateFlags vk_create_flags, + isl_tiling_flags_t *inout_primary_tiling_flags) +{ + if (devinfo->ver <= 8 && + (vk_create_flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT) && + vk_tiling == VK_IMAGE_TILING_OPTIMAL) { + /* We must fallback to a linear surface because we may not be able to + * correctly handle the offsets if tiled. (On gfx9, + * RENDER_SURFACE_STATE::X/Y Offset are sufficient). To prevent garbage + * performance while texturing, we maintain a tiled shadow surface. + */ + assert(isl_format_is_compressed(plane_format.isl_format)); + + if (inout_primary_tiling_flags) { + *inout_primary_tiling_flags = ISL_TILING_LINEAR_BIT; + } + + return true; + } + + if (devinfo->ver <= 7 && + plane_format.aspect == VK_IMAGE_ASPECT_STENCIL_BIT && + (vk_plane_usage & (VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))) { + /* gfx7 can't sample from W-tiled surfaces. */ + return true; + } + + return false; +} + +static bool +can_fast_clear_with_non_zero_color(const struct intel_device_info *devinfo, + const struct anv_image *image, + uint32_t plane, + const VkImageFormatListCreateInfo *fmt_list) +{ + /* If we don't have an AUX surface where fast clears apply, we can return + * early. + */ + if (!isl_aux_usage_has_fast_clears(image->planes[plane].aux_usage)) + return false; + + /* On TGL, if a block of fragment shader outputs match the surface's clear + * color, the HW may convert them to fast-clears (see HSD 14010672564). + * This can lead to rendering corruptions if not handled properly. We + * restrict the clear color to zero to avoid issues that can occur with: + * - Texture view rendering (including blorp_copy calls) + * - Images with multiple levels or array layers + */ + if (devinfo->ver >= 12 && + image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) + return false; + + /* Non mutable image, we can fast clear with any color supported by HW. 
+ */ + if (!(image->vk.create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT)) + return true; + + /* Mutable image with no format list, we have to assume all formats */ + if (!fmt_list || fmt_list->viewFormatCount == 0) + return false; + + enum isl_format img_format = image->planes[plane].primary_surface.isl.format; + + /* Check bit compatibility for clear color components */ + for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) { + struct anv_format_plane view_format_plane = + anv_get_format_plane(devinfo, fmt_list->pViewFormats[i], + plane, image->vk.tiling); + + enum isl_format view_format = view_format_plane.isl_format; + + if (!isl_formats_have_same_bits_per_channel(img_format, view_format)) + return false; + + /* Switching between any of those format types on Gfx7/8 will cause + * problems https://gitlab.freedesktop.org/mesa/mesa/-/issues/1711 + */ + if (devinfo->ver <= 8) { + if (isl_format_has_float_channel(img_format) && + !isl_format_has_float_channel(view_format)) + return false; + + if (isl_format_has_int_channel(img_format) && + !isl_format_has_int_channel(view_format)) + return false; + + if (isl_format_has_unorm_channel(img_format) && + !isl_format_has_unorm_channel(view_format)) + return false; + + if (isl_format_has_snorm_channel(img_format) && + !isl_format_has_snorm_channel(view_format)) + return false; + } + } + + return true; +} + +/** + * Return true if the storage image could be used with atomics. + * + * If the image was created with an explicit format, we check it for typed + * atomic support. If MUTABLE_FORMAT_BIT is set, then we check the optional + * format list, seeing if /any/ of the formats support typed atomics. If no + * list is supplied, we fall back to using the bpb, as the application could + * make an image view with a format that does use atomics. + */ +static bool +storage_image_format_supports_atomic(const struct intel_device_info *devinfo, + VkImageCreateFlags create_flags, + enum isl_format format, + VkImageTiling vk_tiling, + const VkImageFormatListCreateInfo *fmt_list) +{ + if (isl_format_supports_typed_atomics(devinfo, format)) + return true; + + if (!(create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT)) + return false; + + if (fmt_list) { + for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) { + enum isl_format view_format = + anv_get_isl_format(devinfo, fmt_list->pViewFormats[i], + VK_IMAGE_ASPECT_COLOR_BIT, vk_tiling); + + if (isl_format_supports_typed_atomics(devinfo, view_format)) + return true; + } + + return false; + } + + /* No explicit format list. Any 16/32/64bpp format could be used with atomics. */ + unsigned bpb = isl_format_get_layout(format)->bpb; + return bpb == 16 || bpb == 32 || bpb == 64; +} + +static enum isl_format +anv_get_isl_format_with_usage(const struct intel_device_info *devinfo, + VkFormat vk_format, + VkImageAspectFlagBits vk_aspect, + VkImageUsageFlags vk_usage, + VkImageTiling vk_tiling) +{ + assert(util_bitcount(vk_usage) == 1); + struct anv_format_plane format = + anv_get_format_aspect(devinfo, vk_format, vk_aspect, + vk_tiling); + + if ((vk_usage == VK_IMAGE_USAGE_STORAGE_BIT) && + isl_is_storage_image_format(format.isl_format)) { + enum isl_format lowered_format = + isl_lower_storage_image_format(devinfo, format.isl_format); + + /* If we lower the format, we should ensure either they both match in + * bits per channel or that there is no swizzle, because we can't use + * the swizzle for a different bit pattern. 
+ */ + assert(isl_formats_have_same_bits_per_channel(lowered_format, + format.isl_format) || + isl_swizzle_is_identity(format.swizzle)); + + format.isl_format = lowered_format; + } + + return format.isl_format; +} + +static bool +formats_ccs_e_compatible(const struct intel_device_info *devinfo, + VkImageCreateFlags create_flags, + enum isl_format format, VkImageTiling vk_tiling, + VkImageUsageFlags vk_usage, + const VkImageFormatListCreateInfo *fmt_list) +{ + if (!isl_format_supports_ccs_e(devinfo, format)) + return false; + + /* For images created without MUTABLE_FORMAT_BIT set, we know that they will + * always be used with the original format. In particular, they will always + * be used with a format that supports color compression. + */ + if (!(create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT)) + return true; + + if (!fmt_list || fmt_list->viewFormatCount == 0) + return false; + + for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) { + enum isl_format view_format = + anv_get_isl_format_with_usage(devinfo, fmt_list->pViewFormats[i], + VK_IMAGE_ASPECT_COLOR_BIT, vk_usage, + vk_tiling); + + if (!isl_formats_are_ccs_e_compatible(devinfo, format, view_format)) + return false; + } + + return true; +} + +bool +anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo, + VkImageCreateFlags create_flags, + VkFormat vk_format, VkImageTiling vk_tiling, + VkImageUsageFlags vk_usage, + const VkImageFormatListCreateInfo *fmt_list) +{ + enum isl_format format = + anv_get_isl_format_with_usage(devinfo, vk_format, + VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_USAGE_SAMPLED_BIT, vk_tiling); + + if (!formats_ccs_e_compatible(devinfo, create_flags, format, vk_tiling, + VK_IMAGE_USAGE_SAMPLED_BIT, fmt_list)) + return false; + + if (vk_usage & VK_IMAGE_USAGE_STORAGE_BIT) { + if (devinfo->verx10 < 125) + return false; + + enum isl_format lower_format = + anv_get_isl_format_with_usage(devinfo, vk_format, + VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_USAGE_STORAGE_BIT, vk_tiling); + + if (!isl_formats_are_ccs_e_compatible(devinfo, format, lower_format)) + return false; + + if (!formats_ccs_e_compatible(devinfo, create_flags, format, vk_tiling, + VK_IMAGE_USAGE_STORAGE_BIT, fmt_list)) + return false; + + /* Disable compression when surface can be potentially used for atomic + * operation. + */ + if (storage_image_format_supports_atomic(devinfo, create_flags, format, + vk_tiling, fmt_list)) + return false; + } + + return true; +} + +/** + * For color images that have an auxiliary surface, request allocation for an + * additional buffer that mainly stores fast-clear values. Use of this buffer + * allows us to access the image's subresources while being aware of their + * fast-clear values in non-trivial cases (e.g., outside of a render pass in + * which a fast clear has occurred). + * + * In order to avoid having multiple clear colors for a single plane of an + * image (hence a single RENDER_SURFACE_STATE), we only allow fast-clears on + * the first slice (level 0, layer 0). At the time of our testing (Jan 17, + * 2018), there were no known applications which would benefit from fast- + * clearing more than just the first slice. + * + * The fast clear portion of the image is laid out in the following order: + * + * * 1 or 4 dwords (depending on hardware generation) for the clear color + * * 1 dword for the anv_fast_clear_type of the clear color + * * On gfx9+, 1 dword per level and layer of the image (3D levels count + * multiple layers) in level-major order for compression state. 
+ * + * For the purpose of discoverability, the algorithm used to manage + * compression and fast-clears is described here: + * + * * On a transition from UNDEFINED or PREINITIALIZED to a defined layout, + * all of the values in the fast clear portion of the image are initialized + * to default values. + * + * * On fast-clear, the clear value is written into surface state and also + * into the buffer and the fast clear type is set appropriately. Both + * setting the fast-clear value in the buffer and setting the fast-clear + * type happen from the GPU using MI commands. + * + * * Whenever a render or blorp operation is performed with CCS_E, we call + * genX(cmd_buffer_mark_image_written) to set the compression state to + * true (which is represented by UINT32_MAX). + * + * * On pipeline barrier transitions, the worst-case transition is computed + * from the image layouts. The command streamer inspects the fast clear + * type and compression state dwords and constructs a predicate. The + * worst-case resolve is performed with the given predicate and the fast + * clear and compression state is set accordingly. + * + * See anv_layout_to_aux_usage and anv_layout_to_fast_clear_type functions for + * details on exactly what is allowed in what layouts. + * + * On gfx7-9, we do not have a concept of indirect clear colors in hardware. + * In order to deal with this, we have to do some clear color management. + * + * * For LOAD_OP_LOAD at the top of a renderpass, we have to copy the clear + * value from the buffer into the surface state with MI commands. + * + * * For any blorp operations, we pass the address to the clear value into + * blorp and it knows to copy the clear color. + */ +static VkResult MUST_CHECK +add_aux_state_tracking_buffer(struct anv_device *device, + struct anv_image *image, + uint32_t plane) +{ + assert(image && device); + assert(image->planes[plane].aux_usage != ISL_AUX_USAGE_NONE && + image->vk.aspects & (VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV | + VK_IMAGE_ASPECT_DEPTH_BIT)); + + const unsigned clear_color_state_size = device->info->ver >= 10 ? + device->isl_dev.ss.clear_color_state_size : + device->isl_dev.ss.clear_value_size; + + /* Clear color and fast clear type */ + unsigned state_size = clear_color_state_size + 4; + + /* We only need to track compression on CCS_E surfaces. */ + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) { + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + for (uint32_t l = 0; l < image->vk.mip_levels; l++) + state_size += anv_minify(image->vk.extent.depth, l) * 4; + } else { + state_size += image->vk.mip_levels * image->vk.array_layers * 4; + } + } + + enum anv_image_memory_binding binding = + ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane; + + /* If an auxiliary surface is used for an externally-shareable image, + * we have to hide this from the memory of the image since other + * processes with access to the memory may not be aware of it or of + * its current state. So put that auxiliary data into a separate + * buffer (ANV_IMAGE_MEMORY_BINDING_PRIVATE). + */ + if (anv_image_is_externally_shared(image)) { + binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE; + } + + /* We believe that 256B alignment may be sufficient, but we choose 4K due to + * lack of testing. And MI_LOAD/STORE operations require dword-alignment. 
+ */ + return image_binding_grow(device, image, binding, + ANV_OFFSET_IMPLICIT, state_size, 4096, + &image->planes[plane].fast_clear_memory_range); +} + +/** + * The return code indicates whether creation of the VkImage should continue + * or fail, not whether the creation of the aux surface succeeded. If the aux + * surface is not required (for example, by neither hardware nor DRM format + * modifier), then this may return VK_SUCCESS when creation of the aux surface + * fails. + * + * @param offset See add_surface() + */ +static VkResult +add_aux_surface_if_supported(struct anv_device *device, + struct anv_image *image, + uint32_t plane, + struct anv_format_plane plane_format, + const VkImageFormatListCreateInfo *fmt_list, + uint64_t offset, + uint32_t stride, + isl_surf_usage_flags_t isl_extra_usage_flags) +{ + VkImageAspectFlags aspect = plane_format.aspect; + VkResult result; + bool ok; + + /* The aux surface must not be already added. */ + assert(!anv_surface_is_valid(&image->planes[plane].aux_surface)); + + if ((isl_extra_usage_flags & ISL_SURF_USAGE_DISABLE_AUX_BIT)) + return VK_SUCCESS; + + if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) { + /* We don't advertise that depth buffers could be used as storage + * images. + */ + assert(!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)); + + /* Allow the user to control HiZ enabling. Disable by default on gfx7 + * because resolves are not currently implemented pre-BDW. + */ + if (!(image->vk.usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) { + /* It will never be used as an attachment, HiZ is pointless. */ + return VK_SUCCESS; + } + + if (device->info->ver == 7) { + anv_perf_warn(VK_LOG_OBJS(&image->vk.base), "Implement gfx7 HiZ"); + return VK_SUCCESS; + } + + if (image->vk.mip_levels > 1) { + anv_perf_warn(VK_LOG_OBJS(&image->vk.base), "Enable multi-LOD HiZ"); + return VK_SUCCESS; + } + + if (device->info->ver == 8 && image->vk.samples > 1) { + anv_perf_warn(VK_LOG_OBJS(&image->vk.base), + "Enable gfx8 multisampled HiZ"); + return VK_SUCCESS; + } + + if (INTEL_DEBUG(DEBUG_NO_HIZ)) + return VK_SUCCESS; + + ok = isl_surf_get_hiz_surf(&device->isl_dev, + &image->planes[plane].primary_surface.isl, + &image->planes[plane].aux_surface.isl); + if (!ok) + return VK_SUCCESS; + + if (!isl_surf_supports_ccs(&device->isl_dev, + &image->planes[plane].primary_surface.isl, + &image->planes[plane].aux_surface.isl)) { + image->planes[plane].aux_usage = ISL_AUX_USAGE_HIZ; + } else if (image->vk.usage & (VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) && + image->vk.samples == 1) { + /* If it's used as an input attachment or a texture and it's + * single-sampled (this is a requirement for HiZ+CCS write-through + * mode), use write-through mode so that we don't need to resolve + * before texturing. This will make depth testing a bit slower but + * texturing faster. + * + * TODO: This is a heuristic trade-off; we haven't tuned it at all. 
+ */ + assert(device->info->ver >= 12); + image->planes[plane].aux_usage = ISL_AUX_USAGE_HIZ_CCS_WT; + } else { + assert(device->info->ver >= 12); + image->planes[plane].aux_usage = ISL_AUX_USAGE_HIZ_CCS; + } + + result = add_surface(device, image, &image->planes[plane].aux_surface, + ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane, + ANV_OFFSET_IMPLICIT); + if (result != VK_SUCCESS) + return result; + + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT) + return add_aux_state_tracking_buffer(device, image, plane); + } else if (aspect == VK_IMAGE_ASPECT_STENCIL_BIT) { + + if (INTEL_DEBUG(DEBUG_NO_CCS)) + return VK_SUCCESS; + + if (!isl_surf_supports_ccs(&device->isl_dev, + &image->planes[plane].primary_surface.isl, + NULL)) + return VK_SUCCESS; + + image->planes[plane].aux_usage = ISL_AUX_USAGE_STC_CCS; + } else if ((aspect & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && image->vk.samples == 1) { + if (image->n_planes != 1) { + /* Multiplanar images seem to hit a sampler bug with CCS and R16G16 + * format. (Putting the clear state a page/4096bytes further fixes + * the issue). + */ + return VK_SUCCESS; + } + + if ((image->vk.create_flags & VK_IMAGE_CREATE_ALIAS_BIT)) { + /* The image may alias a plane of a multiplanar image. Above we ban + * CCS on multiplanar images. + * + * We must also reject aliasing of any image that uses + * ANV_IMAGE_MEMORY_BINDING_PRIVATE. Since we're already rejecting all + * aliasing here, there's no need to further analyze if the image needs + * a private binding. + */ + return VK_SUCCESS; + } + + if (INTEL_DEBUG(DEBUG_NO_CCS)) + return VK_SUCCESS; + + ok = isl_surf_get_ccs_surf(&device->isl_dev, + &image->planes[plane].primary_surface.isl, + NULL, + &image->planes[plane].aux_surface.isl, + stride); + if (!ok) + return VK_SUCCESS; + + /* Choose aux usage */ + if (anv_formats_ccs_e_compatible(device->info, image->vk.create_flags, + image->vk.format, image->vk.tiling, + image->vk.usage, fmt_list)) { + image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_E; + } else if (device->info->ver >= 12) { + anv_perf_warn(VK_LOG_OBJS(&image->vk.base), + "The CCS_D aux mode is not yet handled on " + "Gfx12+. 
Not allocating a CCS buffer."); + image->planes[plane].aux_surface.isl.size_B = 0; + return VK_SUCCESS; + } else { + image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_D; + } + + if (!device->physical->has_implicit_ccs) { + enum anv_image_memory_binding binding = + ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane; + + if (image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID && + !isl_drm_modifier_has_aux(image->vk.drm_format_mod)) + binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE; + + result = add_surface(device, image, &image->planes[plane].aux_surface, + binding, offset); + if (result != VK_SUCCESS) + return result; + } + + return add_aux_state_tracking_buffer(device, image, plane); + } else if ((aspect & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && image->vk.samples > 1) { + assert(!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)); + ok = isl_surf_get_mcs_surf(&device->isl_dev, + &image->planes[plane].primary_surface.isl, + &image->planes[plane].aux_surface.isl); + if (!ok) + return VK_SUCCESS; + + image->planes[plane].aux_usage = ISL_AUX_USAGE_MCS; + + result = add_surface(device, image, &image->planes[plane].aux_surface, + ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane, + ANV_OFFSET_IMPLICIT); + if (result != VK_SUCCESS) + return result; + + return add_aux_state_tracking_buffer(device, image, plane); + } + + return VK_SUCCESS; +} + +static VkResult +add_shadow_surface(struct anv_device *device, + struct anv_image *image, + uint32_t plane, + struct anv_format_plane plane_format, + uint32_t stride, + VkImageUsageFlags vk_plane_usage) +{ + ASSERTED bool ok; + + ok = isl_surf_init(&device->isl_dev, + &image->planes[plane].shadow_surface.isl, + .dim = vk_to_isl_surf_dim[image->vk.image_type], + .format = plane_format.isl_format, + .width = image->vk.extent.width, + .height = image->vk.extent.height, + .depth = image->vk.extent.depth, + .levels = image->vk.mip_levels, + .array_len = image->vk.array_layers, + .samples = image->vk.samples, + .min_alignment_B = 0, + .row_pitch_B = stride, + .usage = ISL_SURF_USAGE_TEXTURE_BIT | + (vk_plane_usage & ISL_SURF_USAGE_CUBE_BIT), + .tiling_flags = ISL_TILING_ANY_MASK); + + /* isl_surf_init() will fail only if provided invalid input. Invalid input + * here is illegal in Vulkan. + */ + assert(ok); + + return add_surface(device, image, &image->planes[plane].shadow_surface, + ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane, + ANV_OFFSET_IMPLICIT); +} + +/** + * Initialize the anv_image::*_surface selected by \a aspect. Then update the + * image's memory requirements (that is, the image's size and alignment). 
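+ *
+ * For multi-planar formats, the plane's extent is divided by the plane
+ * format's denominator_scales below. A minimal illustration, assuming a
+ * two-plane 4:2:0 format such as VK_FORMAT_G8_B8R8_2PLANE_420_UNORM at
+ * 1920x1080: plane 0 keeps scales {1,1} and stays 1920x1080, while the
+ * chroma plane has scales {2,2}, so its surface is created as
+ *
+ *    .width  = 1920 / 2 = 960
+ *    .height = 1080 / 2 = 540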
+ *
+ * @param offset See add_surface()
+ */
+static VkResult
+add_primary_surface(struct anv_device *device,
+                    struct anv_image *image,
+                    uint32_t plane,
+                    struct anv_format_plane plane_format,
+                    uint64_t offset,
+                    uint32_t stride,
+                    isl_tiling_flags_t isl_tiling_flags,
+                    isl_surf_usage_flags_t isl_usage)
+{
+   struct anv_surface *anv_surf = &image->planes[plane].primary_surface;
+   bool ok;
+
+   ok = isl_surf_init(&device->isl_dev, &anv_surf->isl,
+      .dim = vk_to_isl_surf_dim[image->vk.image_type],
+      .format = plane_format.isl_format,
+      .width = image->vk.extent.width / plane_format.denominator_scales[0],
+      .height = image->vk.extent.height / plane_format.denominator_scales[1],
+      .depth = image->vk.extent.depth,
+      .levels = image->vk.mip_levels,
+      .array_len = image->vk.array_layers,
+      .samples = image->vk.samples,
+      .min_alignment_B = 0,
+      .row_pitch_B = stride,
+      .usage = isl_usage,
+      .tiling_flags = isl_tiling_flags);
+
+   if (!ok) {
+      /* TODO: Should return
+       * VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT in some cases.
+       */
+      return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+   }
+
+   image->planes[plane].aux_usage = ISL_AUX_USAGE_NONE;
+
+   return add_surface(device, image, anv_surf,
+                      ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane, offset);
+}
+
+#ifndef NDEBUG
+static bool MUST_CHECK
+memory_range_is_aligned(struct anv_image_memory_range memory_range)
+{
+   return anv_is_aligned(memory_range.offset, memory_range.alignment);
+}
+
+static bool MUST_CHECK
+memory_ranges_equal(struct anv_image_memory_range a,
+                    struct anv_image_memory_range b)
+{
+   return a.binding == b.binding &&
+          a.offset == b.offset &&
+          a.size == b.size &&
+          a.alignment == b.alignment;
+}
+#endif
+
+struct check_memory_range_params {
+   struct anv_image_memory_range *accum_ranges;
+   const struct anv_surface *test_surface;
+   const struct anv_image_memory_range *test_range;
+   enum anv_image_memory_binding expect_binding;
+};
+
+#define check_memory_range(...) \
+   check_memory_range_s(&(struct check_memory_range_params) { __VA_ARGS__ })
+
+static void UNUSED
+check_memory_range_s(const struct check_memory_range_params *p)
+{
+   assert((p->test_surface == NULL) != (p->test_range == NULL));
+
+   const struct anv_image_memory_range *test_range =
+      p->test_range ?: &p->test_surface->memory_range;
+
+   struct anv_image_memory_range *accum_range =
+      &p->accum_ranges[p->expect_binding];
+
+   assert(test_range->binding == p->expect_binding);
+   assert(test_range->offset >= memory_range_end(*accum_range));
+   assert(memory_range_is_aligned(*test_range));
+
+   if (p->test_surface) {
+      assert(anv_surface_is_valid(p->test_surface));
+      assert(p->test_surface->memory_range.alignment ==
+             p->test_surface->isl.alignment_B);
+   }
+
+   memory_range_merge(accum_range, *test_range);
+}
+
+/**
+ * Validate the image's memory bindings *after* all its surfaces and memory
+ * ranges are final.
+ *
+ * For simplicity's sake, we do not validate free-form layout of the image's
+ * memory bindings. We validate the layout described in the comments of struct
+ * anv_image.
+ */
+static void
+check_memory_bindings(const struct anv_device *device,
+                      const struct anv_image *image)
+{
+#ifdef DEBUG
+   /* As we inspect each part of the image, we merge the part's memory range
+    * into these accumulation ranges.
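+    *
+    * For illustration, a call such as
+    *
+    *    check_memory_range(accum_ranges,
+    *                       .test_surface = &plane->primary_surface,
+    *                       .expect_binding = primary_binding);
+    *
+    * expands through the compound-literal macro above into
+    *
+    *    check_memory_range_s(&(struct check_memory_range_params) {
+    *       accum_ranges,
+    *       .test_surface = &plane->primary_surface,
+    *       .expect_binding = primary_binding
+    *    });
+    *
+    * which asserts that the tested range begins at or after everything
+    * already merged into that binding's accumulation range, and then merges
+    * it in.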
+ */ + struct anv_image_memory_range accum_ranges[ANV_IMAGE_MEMORY_BINDING_END]; + for (int i = 0; i < ANV_IMAGE_MEMORY_BINDING_END; ++i) { + accum_ranges[i] = (struct anv_image_memory_range) { + .binding = i, + }; + } + + for (uint32_t p = 0; p < image->n_planes; ++p) { + const struct anv_image_plane *plane = &image->planes[p]; + + /* The binding that must contain the plane's primary surface. */ + const enum anv_image_memory_binding primary_binding = image->disjoint + ? ANV_IMAGE_MEMORY_BINDING_PLANE_0 + p + : ANV_IMAGE_MEMORY_BINDING_MAIN; + + /* Aliasing is incompatible with the private binding because it does not + * live in a VkDeviceMemory. The one exception is swapchain images. + */ + assert(!(image->vk.create_flags & VK_IMAGE_CREATE_ALIAS_BIT) || + image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].memory_range.size == 0); + + /* Check primary surface */ + check_memory_range(accum_ranges, + .test_surface = &plane->primary_surface, + .expect_binding = primary_binding); + + /* Check shadow surface */ + if (anv_surface_is_valid(&plane->shadow_surface)) { + check_memory_range(accum_ranges, + .test_surface = &plane->shadow_surface, + .expect_binding = primary_binding); + } + + /* Check aux_surface */ + if (anv_surface_is_valid(&plane->aux_surface)) { + enum anv_image_memory_binding binding = primary_binding; + + /* If an auxiliary surface is used for an externally-shareable image, + * we have to hide this from the memory of the image since other + * processes with access to the memory may not be aware of it or of + * its current state. So put that auxiliary data into a separate + * buffer (ANV_IMAGE_MEMORY_BINDING_PRIVATE). + */ + if (anv_image_is_externally_shared(image) && + !isl_drm_modifier_has_aux(image->vk.drm_format_mod)) { + binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE; + } + + /* Display hardware requires that the aux surface start at + * a higher address than the primary surface. The 3D hardware + * doesn't care, but we enforce the display requirement in case + * the image is sent to display. + */ + check_memory_range(accum_ranges, + .test_surface = &plane->aux_surface, + .expect_binding = binding); + } + + /* Check fast clear state */ + if (plane->fast_clear_memory_range.size > 0) { + enum anv_image_memory_binding binding = primary_binding; + + /* If an auxiliary surface is used for an externally-shareable image, + * we have to hide this from the memory of the image since other + * processes with access to the memory may not be aware of it or of + * its current state. So put that auxiliary data into a separate + * buffer (ANV_IMAGE_MEMORY_BINDING_PRIVATE). + */ + if (anv_image_is_externally_shared(image)) { + binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE; + } + + /* We believe that 256B alignment may be sufficient, but we choose 4K + * due to lack of testing. And MI_LOAD/STORE operations require + * dword-alignment. + */ + assert(plane->fast_clear_memory_range.alignment == 4096); + check_memory_range(accum_ranges, + .test_range = &plane->fast_clear_memory_range, + .expect_binding = binding); + } + } +#endif +} + +/** + * Check that the fully-initialized anv_image is compatible with its DRM format + * modifier. + * + * Checking compatibility at the end of image creation is prudent, not + * superfluous, because usage of modifiers triggers numerous special cases + * throughout queries and image creation, and because + * vkGetPhysicalDeviceImageFormatProperties2 has difficulty detecting all + * incompatibilities. 
+ * + * Return VK_ERROR_UNKNOWN if the incompatibility is difficult to detect in + * vkGetPhysicalDeviceImageFormatProperties2. Otherwise, assert fail. + * + * Ideally, if vkGetPhysicalDeviceImageFormatProperties2() succeeds with a given + * modifier, then vkCreateImage() produces an image that is compatible with the + * modifier. However, it is difficult to reconcile the two functions to agree + * due to their complexity. For example, isl_surf_get_ccs_surf() may + * unexpectedly fail in vkCreateImage(), eliminating the image's aux surface + * even when the modifier requires one. (Maybe we should reconcile the two + * functions despite the difficulty). + */ +static VkResult MUST_CHECK +check_drm_format_mod(const struct anv_device *device, + const struct anv_image *image) +{ + /* Image must have a modifier if and only if it has modifier tiling. */ + assert((image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID) == + (image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)); + + if (image->vk.drm_format_mod == DRM_FORMAT_MOD_INVALID) + return VK_SUCCESS; + + const struct isl_drm_modifier_info *isl_mod_info = + isl_drm_modifier_get_info(image->vk.drm_format_mod); + + /* Driver must support the modifier. */ + assert(isl_drm_modifier_get_score(device->info, isl_mod_info->modifier)); + + /* Enforced by us, not the Vulkan spec. */ + assert(image->vk.image_type == VK_IMAGE_TYPE_2D); + assert(!(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)); + assert(!(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT)); + assert(image->vk.mip_levels == 1); + assert(image->vk.array_layers == 1); + assert(image->vk.samples == 1); + + for (int i = 0; i < image->n_planes; ++i) { + const struct anv_image_plane *plane = &image->planes[i]; + ASSERTED const struct isl_format_layout *isl_layout = + isl_format_get_layout(plane->primary_surface.isl.format); + + /* Enforced by us, not the Vulkan spec. */ + assert(isl_layout->txc == ISL_TXC_NONE); + assert(isl_layout->colorspace == ISL_COLORSPACE_LINEAR || + isl_layout->colorspace == ISL_COLORSPACE_SRGB); + assert(!anv_surface_is_valid(&plane->shadow_surface)); + + if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) { + /* Reject DISJOINT for consistency with the GL driver. */ + assert(!image->disjoint); + + /* The modifier's required aux usage mandates the image's aux usage. + * The inverse, however, does not hold; if the modifier has no aux + * usage, then we may enable a private aux surface. + */ + if (plane->aux_usage != isl_mod_info->aux_usage) { + return vk_errorf(device, VK_ERROR_UNKNOWN, + "image with modifier unexpectedly has wrong aux " + "usage"); + } + } + } + + return VK_SUCCESS; +} + +/** + * Use when the app does not provide + * VkImageDrmFormatModifierExplicitCreateInfoEXT. 
+ */
+static VkResult MUST_CHECK
+add_all_surfaces_implicit_layout(
+   struct anv_device *device,
+   struct anv_image *image,
+   const VkImageFormatListCreateInfo *format_list_info,
+   uint32_t stride,
+   isl_tiling_flags_t isl_tiling_flags,
+   isl_surf_usage_flags_t isl_extra_usage_flags)
+{
+   const struct intel_device_info *devinfo = device->info;
+   VkResult result;
+
+   u_foreach_bit(b, image->vk.aspects) {
+      VkImageAspectFlagBits aspect = 1 << b;
+      const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+      const struct anv_format_plane plane_format =
+         anv_get_format_plane(devinfo, image->vk.format, plane, image->vk.tiling);
+
+      VkImageUsageFlags vk_usage = vk_image_usage(&image->vk, aspect);
+      isl_surf_usage_flags_t isl_usage =
+         choose_isl_surf_usage(image->vk.create_flags, vk_usage,
+                               isl_extra_usage_flags, aspect);
+
+      /* Must call this before adding any surfaces because it may modify
+       * isl_tiling_flags.
+       */
+      bool needs_shadow =
+         anv_image_plane_needs_shadow_surface(devinfo, plane_format,
+                                              image->vk.tiling, vk_usage,
+                                              image->vk.create_flags,
+                                              &isl_tiling_flags);
+
+      result = add_primary_surface(device, image, plane, plane_format,
+                                   ANV_OFFSET_IMPLICIT, stride,
+                                   isl_tiling_flags, isl_usage);
+      if (result != VK_SUCCESS)
+         return result;
+
+      if (needs_shadow) {
+         result = add_shadow_surface(device, image, plane, plane_format,
+                                     stride, vk_usage);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+
+      /* Disable aux if image supports export without modifiers. */
+      if (image->vk.external_handle_types != 0 &&
+          image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)
+         continue;
+
+      result = add_aux_surface_if_supported(device, image, plane, plane_format,
+                                            format_list_info,
+                                            ANV_OFFSET_IMPLICIT, stride,
+                                            isl_extra_usage_flags);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   return VK_SUCCESS;
+}
+
+/**
+ * Use when the app provides VkImageDrmFormatModifierExplicitCreateInfoEXT.
+ */
+static VkResult
+add_all_surfaces_explicit_layout(
+   struct anv_device *device,
+   struct anv_image *image,
+   const VkImageFormatListCreateInfo *format_list_info,
+   const VkImageDrmFormatModifierExplicitCreateInfoEXT *drm_info,
+   isl_tiling_flags_t isl_tiling_flags,
+   isl_surf_usage_flags_t isl_extra_usage_flags)
+{
+   const struct intel_device_info *devinfo = device->info;
+   const uint32_t mod_plane_count = drm_info->drmFormatModifierPlaneCount;
+   const bool mod_has_aux =
+      isl_drm_modifier_has_aux(drm_info->drmFormatModifier);
+   VkResult result;
+
+   /* About valid usage in the Vulkan spec:
+    *
+    * Unlike vanilla vkCreateImage, which produces undefined behavior on user
+    * error, here the spec requires the implementation to return
+    * VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT if the app provides
+    * a bad plane layout. However, the spec does require
+    * drmFormatModifierPlaneCount to be valid.
+    *
+    * Most validation of plane layout occurs in add_surface().
+    */
+
+   /* We support a restricted set of images with modifiers.
+    *
+    * With aux usage,
+    * - Format plane count must be 1.
+    * - Memory plane count must be 2.
+    * Without aux usage,
+    * - Each format plane must map to a distinct memory plane.
+    *
+    * For the other cases, currently there is no way to properly map memory
+    * planes to format planes and aux planes due to the lack of defined ABI
+    * for external multi-planar images.
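+    *
+    * As an illustration of the aux case (using I915_FORMAT_MOD_Y_TILED_CCS
+    * purely as an example of an aux-bearing modifier): the image has a
+    * single format plane but two memory planes, and a hypothetical app-side
+    * layout description could look like
+    *
+    *    VkSubresourceLayout layouts[2] = {
+    *       { .offset = 0,       .rowPitch = 7680 },  // memory plane 0: main
+    *       { .offset = 8294400, .rowPitch = 1920 },  // memory plane 1: CCS
+    *    };
+    *    VkImageDrmFormatModifierExplicitCreateInfoEXT drm_info = {
+    *       .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT,
+    *       .drmFormatModifier = I915_FORMAT_MOD_Y_TILED_CCS,
+    *       .drmFormatModifierPlaneCount = 2,
+    *       .pPlaneLayouts = layouts,
+    *    };
+    *
+    * The offsets and pitches above are made up for the example; the loop
+    * below simply reads pPlaneLayouts[0] for the primary surface and
+    * pPlaneLayouts[1] for the aux surface.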
+ */ + if (image->n_planes == 1) + assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); + else + assert(!(image->vk.aspects & ~VK_IMAGE_ASPECT_PLANES_BITS_ANV)); + + if (mod_has_aux) + assert(image->n_planes == 1 && mod_plane_count == 2); + else + assert(image->n_planes == mod_plane_count); + + /* Reject special values in the app-provided plane layouts. */ + for (uint32_t i = 0; i < mod_plane_count; ++i) { + if (drm_info->pPlaneLayouts[i].rowPitch == 0) { + return vk_errorf(device, + VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT, + "VkImageDrmFormatModifierExplicitCreateInfoEXT::" + "pPlaneLayouts[%u]::rowPitch is 0", i); + } + + if (drm_info->pPlaneLayouts[i].offset == ANV_OFFSET_IMPLICIT) { + return vk_errorf(device, + VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT, + "VkImageDrmFormatModifierExplicitCreateInfoEXT::" + "pPlaneLayouts[%u]::offset is %" PRIu64, + i, ANV_OFFSET_IMPLICIT); + } + } + + u_foreach_bit(b, image->vk.aspects) { + const VkImageAspectFlagBits aspect = 1 << b; + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + const struct anv_format_plane format_plane = + anv_get_format_plane(devinfo, image->vk.format, plane, image->vk.tiling); + const VkSubresourceLayout *primary_layout = &drm_info->pPlaneLayouts[plane]; + + result = add_primary_surface(device, image, plane, + format_plane, + primary_layout->offset, + primary_layout->rowPitch, + isl_tiling_flags, + isl_extra_usage_flags); + if (result != VK_SUCCESS) + return result; + + if (mod_has_aux) { + const VkSubresourceLayout *aux_layout = &drm_info->pPlaneLayouts[1]; + result = add_aux_surface_if_supported(device, image, plane, + format_plane, + format_list_info, + aux_layout->offset, + aux_layout->rowPitch, + isl_extra_usage_flags); + if (result != VK_SUCCESS) + return result; + } + } + + return VK_SUCCESS; +} + +static const struct isl_drm_modifier_info * +choose_drm_format_mod(const struct anv_physical_device *device, + uint32_t modifier_count, const uint64_t *modifiers) +{ + uint64_t best_mod = UINT64_MAX; + uint32_t best_score = 0; + + for (uint32_t i = 0; i < modifier_count; ++i) { + uint32_t score = isl_drm_modifier_get_score(&device->info, modifiers[i]); + if (score > best_score) { + best_mod = modifiers[i]; + best_score = score; + } + } + + if (best_score > 0) + return isl_drm_modifier_get_info(best_mod); + else + return NULL; +} + +static VkImageUsageFlags +anv_image_create_usage(const VkImageCreateInfo *pCreateInfo, + VkImageUsageFlags usage) +{ + /* Add TRANSFER_SRC usage for multisample attachment images. This is + * because we might internally use the TRANSFER_SRC layout on them for + * blorp operations associated with resolving those into other attachments + * at the end of a subpass. + * + * Without this additional usage, we compute an incorrect AUX state in + * anv_layout_to_aux_state(). 
+ */ + if (pCreateInfo->samples > VK_SAMPLE_COUNT_1_BIT && + (usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT))) + usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + return usage; +} + +static VkResult MUST_CHECK +alloc_private_binding(struct anv_device *device, + struct anv_image *image, + const VkImageCreateInfo *create_info) +{ + struct anv_image_binding *binding = + &image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE]; + + if (binding->memory_range.size == 0) + return VK_SUCCESS; + + const VkImageSwapchainCreateInfoKHR *swapchain_info = + vk_find_struct_const(create_info->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR); + + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) { + /* The image will be bound to swapchain memory. */ + return VK_SUCCESS; + } + + return anv_device_alloc_bo(device, "image-binding-private", + binding->memory_range.size, 0, 0, + &binding->address.bo); +} + +VkResult +anv_image_init(struct anv_device *device, struct anv_image *image, + const struct anv_image_create_info *create_info) +{ + const VkImageCreateInfo *pCreateInfo = create_info->vk_info; + const struct VkImageDrmFormatModifierExplicitCreateInfoEXT *mod_explicit_info = NULL; + const struct isl_drm_modifier_info *isl_mod_info = NULL; + VkResult r; + + vk_image_init(&device->vk, &image->vk, pCreateInfo); + + image->vk.usage = anv_image_create_usage(pCreateInfo, image->vk.usage); + image->vk.stencil_usage = + anv_image_create_usage(pCreateInfo, image->vk.stencil_usage); + + if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + assert(!image->vk.wsi_legacy_scanout); + mod_explicit_info = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); + if (mod_explicit_info) { + isl_mod_info = isl_drm_modifier_get_info(mod_explicit_info->drmFormatModifier); + } else { + const struct VkImageDrmFormatModifierListCreateInfoEXT *mod_list_info = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); + isl_mod_info = choose_drm_format_mod(device->physical, + mod_list_info->drmFormatModifierCount, + mod_list_info->pDrmFormatModifiers); + } + + assert(isl_mod_info); + assert(image->vk.drm_format_mod == DRM_FORMAT_MOD_INVALID); + image->vk.drm_format_mod = isl_mod_info->modifier; + } + + for (int i = 0; i < ANV_IMAGE_MEMORY_BINDING_END; ++i) { + image->bindings[i] = (struct anv_image_binding) { + .memory_range = { .binding = i }, + }; + } + + /* In case of AHardwareBuffer import, we don't know the layout yet */ + if (image->vk.external_handle_types & + VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID) { + image->from_ahb = true; + return VK_SUCCESS; + } + + image->n_planes = anv_get_format_planes(image->vk.format); + + /* The Vulkan 1.2.165 glossary says: + * + * A disjoint image consists of multiple disjoint planes, and is created + * with the VK_IMAGE_CREATE_DISJOINT_BIT bit set. 
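+    *
+    * So, for illustration, a two-plane YCbCr image created with
+    * VK_IMAGE_CREATE_DISJOINT_BIT ends up with n_planes == 2 and
+    * disjoint == true below, while the flag has no effect on a
+    * single-planar format (n_planes == 1).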
+ */ + image->disjoint = image->n_planes > 1 && + (pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT); + + const isl_tiling_flags_t isl_tiling_flags = + choose_isl_tiling_flags(device->info, create_info, isl_mod_info, + image->vk.wsi_legacy_scanout); + + const VkImageFormatListCreateInfo *fmt_list = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_FORMAT_LIST_CREATE_INFO); + + if (mod_explicit_info) { + r = add_all_surfaces_explicit_layout(device, image, fmt_list, + mod_explicit_info, isl_tiling_flags, + create_info->isl_extra_usage_flags); + } else { + r = add_all_surfaces_implicit_layout(device, image, fmt_list, 0, + isl_tiling_flags, + create_info->isl_extra_usage_flags); + } + + if (r != VK_SUCCESS) + goto fail; + + r = alloc_private_binding(device, image, pCreateInfo); + if (r != VK_SUCCESS) + goto fail; + + check_memory_bindings(device, image); + + r = check_drm_format_mod(device, image); + if (r != VK_SUCCESS) + goto fail; + + /* Once we have all the bindings, determine whether we can do non 0 fast + * clears for each plane. + */ + for (uint32_t p = 0; p < image->n_planes; p++) { + image->planes[p].can_non_zero_fast_clear = + can_fast_clear_with_non_zero_color(device->info, image, p, fmt_list); + } + + return VK_SUCCESS; + +fail: + vk_image_finish(&image->vk); + return r; +} + +void +anv_image_finish(struct anv_image *image) +{ + struct anv_device *device = + container_of(image->vk.base.device, struct anv_device, vk); + + if (image->from_gralloc) { + assert(!image->disjoint); + assert(image->n_planes == 1); + assert(image->planes[0].primary_surface.memory_range.binding == + ANV_IMAGE_MEMORY_BINDING_MAIN); + assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo != NULL); + anv_device_release_bo(device, image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo); + } + + struct anv_bo *private_bo = image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo; + if (private_bo) + anv_device_release_bo(device, private_bo); + + vk_image_finish(&image->vk); +} + +static struct anv_image * +anv_swapchain_get_image(VkSwapchainKHR swapchain, + uint32_t index) +{ + VkImage image = wsi_common_get_image(swapchain, index); + return anv_image_from_handle(image); +} + +static VkResult +anv_image_init_from_create_info(struct anv_device *device, + struct anv_image *image, + const VkImageCreateInfo *pCreateInfo) +{ + const VkNativeBufferANDROID *gralloc_info = + vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID); + if (gralloc_info) + return anv_image_init_from_gralloc(device, image, pCreateInfo, + gralloc_info); + + struct anv_image_create_info create_info = { + .vk_info = pCreateInfo, + }; + + /* For dmabuf imports, configure the primary surface without support for + * compression if the modifier doesn't specify it. This helps to create + * VkImages with memory requirements that are compatible with the buffers + * apps provide. 
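+    *
+    * For example, importing a dmabuf that uses DRM_FORMAT_MOD_LINEAR (a
+    * modifier with no aux surface) takes the branch below:
+    * isl_drm_modifier_has_aux() returns false, ISL_SURF_USAGE_DISABLE_AUX_BIT
+    * is added, and no CCS is allocated, so the VkImage's memory requirements
+    * stay compatible with the buffer being imported.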
+ */ + const struct VkImageDrmFormatModifierExplicitCreateInfoEXT *mod_explicit_info = + vk_find_struct_const(pCreateInfo->pNext, + IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT); + if (mod_explicit_info && + !isl_drm_modifier_has_aux(mod_explicit_info->drmFormatModifier)) + create_info.isl_extra_usage_flags |= ISL_SURF_USAGE_DISABLE_AUX_BIT; + + return anv_image_init(device, image, &create_info); +} + +VkResult anv_CreateImage( + VkDevice _device, + const VkImageCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkImage* pImage) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + +#ifndef VK_USE_PLATFORM_ANDROID_KHR + /* Ignore swapchain creation info on Android. Since we don't have an + * implementation in Mesa, we're guaranteed to access an Android object + * incorrectly. + */ + const VkImageSwapchainCreateInfoKHR *swapchain_info = + vk_find_struct_const(pCreateInfo->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR); + if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) { + return wsi_common_create_swapchain_image(&device->physical->wsi_device, + pCreateInfo, + swapchain_info->swapchain, + pImage); + } +#endif + + struct anv_image *image = + vk_object_zalloc(&device->vk, pAllocator, sizeof(*image), + VK_OBJECT_TYPE_IMAGE); + if (!image) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + VkResult result = anv_image_init_from_create_info(device, image, + pCreateInfo); + if (result != VK_SUCCESS) { + vk_object_free(&device->vk, pAllocator, image); + return result; + } + + *pImage = anv_image_to_handle(image); + + return result; +} + +void +anv_DestroyImage(VkDevice _device, VkImage _image, + const VkAllocationCallbacks *pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_image, image, _image); + + if (!image) + return; + + assert(&device->vk == image->vk.base.device); + anv_image_finish(image); + + vk_free2(&device->vk.alloc, pAllocator, image); +} + +/* We are binding AHardwareBuffer. Get a description, resolve the + * format and prepare anv_image properly. + */ +static void +resolve_ahw_image(struct anv_device *device, + struct anv_image *image, + struct anv_device_memory *mem) +{ +#if defined(ANDROID) && ANDROID_API_LEVEL >= 26 + assert(mem->ahw); + AHardwareBuffer_Desc desc; + AHardwareBuffer_describe(mem->ahw, &desc); + VkResult result; + + /* Check tiling. */ + enum isl_tiling tiling; + result = anv_device_get_bo_tiling(device, mem->bo, &tiling); + assert(result == VK_SUCCESS); + + VkImageTiling vk_tiling = + tiling == ISL_TILING_LINEAR ? VK_IMAGE_TILING_LINEAR : + VK_IMAGE_TILING_OPTIMAL; + isl_tiling_flags_t isl_tiling_flags = (1u << tiling); + + /* Check format. */ + VkFormat vk_format = vk_format_from_android(desc.format, desc.usage); + enum isl_format isl_fmt = anv_get_isl_format(device->info, + vk_format, + VK_IMAGE_ASPECT_COLOR_BIT, + vk_tiling); + assert(isl_fmt != ISL_FORMAT_UNSUPPORTED); + + /* Handle RGB(X)->RGBA fallback. */ + switch (desc.format) { + case AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM: + case AHARDWAREBUFFER_FORMAT_R8G8B8X8_UNORM: + if (isl_format_is_rgb(isl_fmt)) + isl_fmt = isl_format_rgb_to_rgba(isl_fmt); + break; + } + + /* Now we are able to fill anv_image fields properly and create + * isl_surface for it. 
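+    *
+    * The AHardwareBuffer stride is reported in pixels, so it is converted to
+    * a byte pitch below. A minimal worked example, assuming an RGBA8888
+    * buffer (32 bits per block) with desc.stride = 1920:
+    *
+    *    stride = 1920 * (32 / 8) = 7680 bytes per row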
+    */
+   vk_image_set_format(&image->vk, vk_format);
+   image->n_planes = anv_get_format_planes(image->vk.format);
+
+   uint32_t stride = desc.stride *
+                     (isl_format_get_layout(isl_fmt)->bpb / 8);
+
+   result = add_all_surfaces_implicit_layout(device, image, NULL, stride,
+                                             isl_tiling_flags,
+                                             ISL_SURF_USAGE_DISABLE_AUX_BIT);
+   assert(result == VK_SUCCESS);
+#endif
+}
+
+void
+anv_image_get_memory_requirements(struct anv_device *device,
+                                  struct anv_image *image,
+                                  VkImageAspectFlags aspects,
+                                  VkMemoryRequirements2 *pMemoryRequirements)
+{
+   /* The Vulkan spec (git aaed022) says:
+    *
+    *    memoryTypeBits is a bitfield and contains one bit set for every
+    *    supported memory type for the resource. The bit `1<<i` is set if and
+    *    only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
+    *    structure for the physical device is supported.
+    *
+    * All types are currently supported for images.
+    */
+   uint32_t memory_types = (1ull << device->physical->memory.type_count) - 1;
+
+   vk_foreach_struct(ext, pMemoryRequirements->pNext) {
+      switch (ext->sType) {
+      case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
+         VkMemoryDedicatedRequirements *requirements = (void *)ext;
+         if (image->vk.wsi_legacy_scanout || image->from_ahb) {
+            /* If we need to set the tiling for external consumers, we need a
+             * dedicated allocation.
+             *
+             * See also anv_AllocateMemory.
+             */
+            requirements->prefersDedicatedAllocation = true;
+            requirements->requiresDedicatedAllocation = true;
+         } else {
+            requirements->prefersDedicatedAllocation = false;
+            requirements->requiresDedicatedAllocation = false;
+         }
+         break;
+      }
+
+      default:
+         anv_debug_ignored_stype(ext->sType);
+         break;
+      }
+   }
+
+   /* If the image is disjoint, then we must return the memory requirements for
+    * the single plane specified in VkImagePlaneMemoryRequirementsInfo. If
+    * non-disjoint, then exactly one set of memory requirements exists for the
+    * whole image.
+    *
+    * This is enforced by the Valid Usage for VkImageMemoryRequirementsInfo2,
+    * which requires that the app provide VkImagePlaneMemoryRequirementsInfo if
+    * and only if the image is disjoint (that is, multi-planar format and
+    * VK_IMAGE_CREATE_DISJOINT_BIT).
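+    *
+    * A minimal sketch of the app-side query that reaches the disjoint branch
+    * below (illustrative only):
+    *
+    *    VkImagePlaneMemoryRequirementsInfo plane_info = {
+    *       .sType = VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO,
+    *       .planeAspect = VK_IMAGE_ASPECT_PLANE_1_BIT,
+    *    };
+    *    VkImageMemoryRequirementsInfo2 info = {
+    *       .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
+    *       .pNext = &plane_info,
+    *       .image = image,
+    *    };
+    *    VkMemoryRequirements2 reqs = {
+    *       .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
+    *    };
+    *    vkGetImageMemoryRequirements2(device, &info, &reqs);
+    *
+    * in which case only the requested plane's binding is reported.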
+ */ + const struct anv_image_binding *binding; + if (image->disjoint) { + assert(util_bitcount(aspects) == 1); + assert(aspects & image->vk.aspects); + binding = image_aspect_to_binding(image, aspects); + } else { + assert(aspects == image->vk.aspects); + binding = &image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN]; + } + + pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { + .size = binding->memory_range.size, + .alignment = binding->memory_range.alignment, + .memoryTypeBits = memory_types, + }; +} + +void anv_GetImageMemoryRequirements2( + VkDevice _device, + const VkImageMemoryRequirementsInfo2* pInfo, + VkMemoryRequirements2* pMemoryRequirements) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_image, image, pInfo->image); + + VkImageAspectFlags aspects = image->vk.aspects; + + vk_foreach_struct_const(ext, pInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO: { + assert(image->disjoint); + const VkImagePlaneMemoryRequirementsInfo *plane_reqs = + (const VkImagePlaneMemoryRequirementsInfo *) ext; + aspects = plane_reqs->planeAspect; + break; + } + + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } + + anv_image_get_memory_requirements(device, image, aspects, + pMemoryRequirements); +} + +void anv_GetDeviceImageMemoryRequirementsKHR( + VkDevice _device, + const VkDeviceImageMemoryRequirements* pInfo, + VkMemoryRequirements2* pMemoryRequirements) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_image image = { 0 }; + + ASSERTED VkResult result = + anv_image_init_from_create_info(device, &image, pInfo->pCreateInfo); + assert(result == VK_SUCCESS); + + VkImageAspectFlags aspects = + image.disjoint ? pInfo->planeAspect : image.vk.aspects; + + anv_image_get_memory_requirements(device, &image, aspects, + pMemoryRequirements); +} + +void anv_GetImageSparseMemoryRequirements( + VkDevice device, + VkImage image, + uint32_t* pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements* pSparseMemoryRequirements) +{ + *pSparseMemoryRequirementCount = 0; +} + +void anv_GetImageSparseMemoryRequirements2( + VkDevice device, + const VkImageSparseMemoryRequirementsInfo2* pInfo, + uint32_t* pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2* pSparseMemoryRequirements) +{ + *pSparseMemoryRequirementCount = 0; +} + +void anv_GetDeviceImageSparseMemoryRequirementsKHR( + VkDevice device, + const VkDeviceImageMemoryRequirements* pInfo, + uint32_t* pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2* pSparseMemoryRequirements) +{ + *pSparseMemoryRequirementCount = 0; +} + +VkResult anv_BindImageMemory2( + VkDevice _device, + uint32_t bindInfoCount, + const VkBindImageMemoryInfo* pBindInfos) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + for (uint32_t i = 0; i < bindInfoCount; i++) { + const VkBindImageMemoryInfo *bind_info = &pBindInfos[i]; + ANV_FROM_HANDLE(anv_device_memory, mem, bind_info->memory); + ANV_FROM_HANDLE(anv_image, image, bind_info->image); + bool did_bind = false; + + /* Resolve will alter the image's aspects, do this first. */ + if (mem && mem->ahw) + resolve_ahw_image(device, image, mem); + + vk_foreach_struct_const(s, bind_info->pNext) { + switch (s->sType) { + case VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO: { + const VkBindImagePlaneMemoryInfo *plane_info = + (const VkBindImagePlaneMemoryInfo *) s; + + /* Workaround for possible spec bug. 
+ * + * Unlike VkImagePlaneMemoryRequirementsInfo, which requires that + * the image be disjoint (that is, multi-planar format and + * VK_IMAGE_CREATE_DISJOINT_BIT), VkBindImagePlaneMemoryInfo allows + * the image to be non-disjoint and requires only that the image + * have the DISJOINT flag. In this case, regardless of the value of + * VkImagePlaneMemoryRequirementsInfo::planeAspect, the behavior is + * the same as if VkImagePlaneMemoryRequirementsInfo were omitted. + */ + if (!image->disjoint) + break; + + struct anv_image_binding *binding = + image_aspect_to_binding(image, plane_info->planeAspect); + + binding->address = (struct anv_address) { + .bo = mem->bo, + .offset = bind_info->memoryOffset, + }; + + did_bind = true; + break; + } + case VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR: { + /* Ignore this struct on Android, we cannot access swapchain + * structures there. + */ +#ifndef VK_USE_PLATFORM_ANDROID_KHR + const VkBindImageMemorySwapchainInfoKHR *swapchain_info = + (const VkBindImageMemorySwapchainInfoKHR *) s; + struct anv_image *swapchain_image = + anv_swapchain_get_image(swapchain_info->swapchain, + swapchain_info->imageIndex); + assert(swapchain_image); + assert(image->vk.aspects == swapchain_image->vk.aspects); + assert(mem == NULL); + + for (int j = 0; j < ARRAY_SIZE(image->bindings); ++j) { + assert(memory_ranges_equal(image->bindings[j].memory_range, + swapchain_image->bindings[j].memory_range)); + image->bindings[j].address = swapchain_image->bindings[j].address; + } + + /* We must bump the private binding's bo's refcount because, unlike the other + * bindings, its lifetime is not application-managed. + */ + struct anv_bo *private_bo = + image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo; + if (private_bo) + anv_bo_ref(private_bo); + + did_bind = true; +#endif + break; + } +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wswitch" + case VK_STRUCTURE_TYPE_NATIVE_BUFFER_ANDROID: { + const VkNativeBufferANDROID *gralloc_info = + (const VkNativeBufferANDROID *)s; + VkResult result = anv_image_bind_from_gralloc(device, image, + gralloc_info); + if (result != VK_SUCCESS) + return result; + did_bind = true; + break; + } +#pragma GCC diagnostic pop + default: + anv_debug_ignored_stype(s->sType); + break; + } + } + + if (!did_bind) { + assert(!image->disjoint); + + image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address = + (struct anv_address) { + .bo = mem->bo, + .offset = bind_info->memoryOffset, + }; + + did_bind = true; + } + + /* On platforms that use implicit CCS, if the plane's bo lacks implicit + * CCS then disable compression on the plane. + */ + for (int p = 0; p < image->n_planes; ++p) { + enum anv_image_memory_binding binding = + image->planes[p].primary_surface.memory_range.binding; + const struct anv_bo *bo = + image->bindings[binding].address.bo; + + if (!bo || bo->has_implicit_ccs) + continue; + + if (!device->physical->has_implicit_ccs) + continue; + + if (!isl_aux_usage_has_ccs(image->planes[p].aux_usage)) + continue; + + anv_perf_warn(VK_LOG_OBJS(&image->vk.base), + "BO lacks implicit CCS. 
Disabling the CCS aux usage."); + + if (image->planes[p].aux_surface.memory_range.size > 0) { + assert(image->planes[p].aux_usage == ISL_AUX_USAGE_HIZ_CCS || + image->planes[p].aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT); + image->planes[p].aux_usage = ISL_AUX_USAGE_HIZ; + } else { + assert(image->planes[p].aux_usage == ISL_AUX_USAGE_CCS_E || + image->planes[p].aux_usage == ISL_AUX_USAGE_STC_CCS); + image->planes[p].aux_usage = ISL_AUX_USAGE_NONE; + } + } + } + + return VK_SUCCESS; +} + +void anv_GetImageSubresourceLayout( + VkDevice device, + VkImage _image, + const VkImageSubresource* subresource, + VkSubresourceLayout* layout) +{ + ANV_FROM_HANDLE(anv_image, image, _image); + const struct anv_surface *surface; + + assert(__builtin_popcount(subresource->aspectMask) == 1); + + /* The Vulkan spec requires that aspectMask be + * VK_IMAGE_ASPECT_MEMORY_PLANE_i_BIT_EXT if tiling is + * VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT. + * + * For swapchain images, the Vulkan spec says that every swapchain image has + * tiling VK_IMAGE_TILING_OPTIMAL, but we may choose + * VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT internally. Vulkan doesn't allow + * vkGetImageSubresourceLayout for images with VK_IMAGE_TILING_OPTIMAL, + * therefore it's invalid for the application to call this on a swapchain + * image. The WSI code, however, knows when it has internally created + * a swapchain image with VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, + * so it _should_ correctly use VK_IMAGE_ASPECT_MEMORY_PLANE_* in that case. + * But it incorrectly uses VK_IMAGE_ASPECT_PLANE_*, so we have a temporary + * workaround. + */ + if (image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + /* TODO(chadv): Drop this workaround when WSI gets fixed. */ + uint32_t mem_plane; + switch (subresource->aspectMask) { + case VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT: + case VK_IMAGE_ASPECT_PLANE_0_BIT: + mem_plane = 0; + break; + case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: + case VK_IMAGE_ASPECT_PLANE_1_BIT: + mem_plane = 1; + break; + case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: + case VK_IMAGE_ASPECT_PLANE_2_BIT: + mem_plane = 2; + break; + default: + unreachable("bad VkImageAspectFlags"); + } + + if (mem_plane == 1 && isl_drm_modifier_has_aux(image->vk.drm_format_mod)) { + assert(image->n_planes == 1); + /* If the memory binding differs between primary and aux, then the + * returned offset will be incorrect. 
+ */ + assert(image->planes[0].aux_surface.memory_range.binding == + image->planes[0].primary_surface.memory_range.binding); + surface = &image->planes[0].aux_surface; + } else { + assert(mem_plane < image->n_planes); + surface = &image->planes[mem_plane].primary_surface; + } + } else { + const uint32_t plane = + anv_image_aspect_to_plane(image, subresource->aspectMask); + surface = &image->planes[plane].primary_surface; + } + + layout->offset = surface->memory_range.offset; + layout->rowPitch = surface->isl.row_pitch_B; + layout->depthPitch = isl_surf_get_array_pitch(&surface->isl); + layout->arrayPitch = isl_surf_get_array_pitch(&surface->isl); + + if (subresource->mipLevel > 0 || subresource->arrayLayer > 0) { + assert(surface->isl.tiling == ISL_TILING_LINEAR); + + uint64_t offset_B; + isl_surf_get_image_offset_B_tile_sa(&surface->isl, + subresource->mipLevel, + subresource->arrayLayer, + 0 /* logical_z_offset_px */, + &offset_B, NULL, NULL); + layout->offset += offset_B; + layout->size = layout->rowPitch * anv_minify(image->vk.extent.height, + subresource->mipLevel) * + image->vk.extent.depth; + } else { + layout->size = surface->memory_range.size; + } +} + +/** + * This function returns the assumed isl_aux_state for a given VkImageLayout. + * Because Vulkan image layouts don't map directly to isl_aux_state enums, the + * returned enum is the assumed worst case. + * + * @param devinfo The device information of the Intel GPU. + * @param image The image that may contain a collection of buffers. + * @param aspect The aspect of the image to be accessed. + * @param layout The current layout of the image aspect(s). + * + * @return The primary buffer that should be used for the given layout. + */ +enum isl_aux_state ATTRIBUTE_PURE +anv_layout_to_aux_state(const struct intel_device_info * const devinfo, + const struct anv_image * const image, + const VkImageAspectFlagBits aspect, + const VkImageLayout layout) +{ + /* Validate the inputs. */ + + /* The devinfo is needed as the optimal buffer varies across generations. */ + assert(devinfo != NULL); + + /* The layout of a NULL image is not properly defined. */ + assert(image != NULL); + + /* The aspect must be exactly one of the image aspects. */ + assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects)); + + /* Determine the optimal buffer. */ + + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + /* If we don't have an aux buffer then aux state makes no sense */ + const enum isl_aux_usage aux_usage = image->planes[plane].aux_usage; + assert(aux_usage != ISL_AUX_USAGE_NONE); + + /* All images that use an auxiliary surface are required to be tiled. */ + assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR); + + /* Handle a few special cases */ + switch (layout) { + /* Invalid layouts */ + case VK_IMAGE_LAYOUT_MAX_ENUM: + unreachable("Invalid image layout."); + + /* Undefined layouts + * + * The pre-initialized layout is equivalent to the undefined layout for + * optimally-tiled images. We can only do color compression (CCS or HiZ) + * on tiled images. + */ + case VK_IMAGE_LAYOUT_UNDEFINED: + case VK_IMAGE_LAYOUT_PREINITIALIZED: + return ISL_AUX_STATE_AUX_INVALID; + + case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: { + assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); + + enum isl_aux_state aux_state = + isl_drm_modifier_get_default_aux_state(image->vk.drm_format_mod); + + switch (aux_state) { + case ISL_AUX_STATE_AUX_INVALID: + /* The modifier does not support compression. 
But, if we arrived + * here, then we have enabled compression on it anyway, in which case + * we must resolve the aux surface before we release ownership to the + * presentation engine (because, having no modifier, the presentation + * engine will not be aware of the aux surface). The presentation + * engine will not access the aux surface (because it is unware of + * it), and so the aux surface will still be resolved when we + * re-acquire ownership. + * + * Therefore, at ownership transfers in either direction, there does + * exist an aux surface despite the lack of modifier and its state is + * pass-through. + */ + return ISL_AUX_STATE_PASS_THROUGH; + case ISL_AUX_STATE_COMPRESSED_NO_CLEAR: + return ISL_AUX_STATE_COMPRESSED_NO_CLEAR; + default: + unreachable("unexpected isl_aux_state"); + } + } + + default: + break; + } + + const bool read_only = vk_image_layout_is_read_only(layout, aspect); + + const VkImageUsageFlags image_aspect_usage = + vk_image_usage(&image->vk, aspect); + const VkImageUsageFlags usage = + vk_image_layout_to_usage_flags(layout, aspect) & image_aspect_usage; + + bool aux_supported = true; + bool clear_supported = isl_aux_usage_has_fast_clears(aux_usage); + + if ((usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) && !read_only) { + /* This image could be used as both an input attachment and a render + * target (depth, stencil, or color) at the same time and this can cause + * corruption. + * + * We currently only disable aux in this way for depth even though we + * disable it for color in GL. + * + * TODO: Should we be disabling this in more cases? + */ + if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT && devinfo->ver <= 9) { + aux_supported = false; + clear_supported = false; + } + } + + if (usage & (VK_IMAGE_USAGE_TRANSFER_SRC_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) { + switch (aux_usage) { + case ISL_AUX_USAGE_HIZ: + if (!anv_can_sample_with_hiz(devinfo, image)) { + aux_supported = false; + clear_supported = false; + } + break; + + case ISL_AUX_USAGE_HIZ_CCS: + aux_supported = false; + clear_supported = false; + break; + + case ISL_AUX_USAGE_HIZ_CCS_WT: + break; + + case ISL_AUX_USAGE_CCS_D: + aux_supported = false; + clear_supported = false; + break; + + case ISL_AUX_USAGE_MCS: + if (!anv_can_sample_mcs_with_clear(devinfo, image)) + clear_supported = false; + break; + + case ISL_AUX_USAGE_CCS_E: + case ISL_AUX_USAGE_STC_CCS: + break; + + default: + unreachable("Unsupported aux usage"); + } + } + + switch (aux_usage) { + case ISL_AUX_USAGE_HIZ: + case ISL_AUX_USAGE_HIZ_CCS: + case ISL_AUX_USAGE_HIZ_CCS_WT: + if (aux_supported) { + assert(clear_supported); + return ISL_AUX_STATE_COMPRESSED_CLEAR; + } else if (read_only) { + return ISL_AUX_STATE_RESOLVED; + } else { + return ISL_AUX_STATE_AUX_INVALID; + } + + case ISL_AUX_USAGE_CCS_D: + /* We only support clear in exactly one state */ + if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { + assert(aux_supported); + assert(clear_supported); + return ISL_AUX_STATE_PARTIAL_CLEAR; + } else { + return ISL_AUX_STATE_PASS_THROUGH; + } + + case ISL_AUX_USAGE_CCS_E: + if (aux_supported) { + assert(clear_supported); + return ISL_AUX_STATE_COMPRESSED_CLEAR; + } else { + return ISL_AUX_STATE_PASS_THROUGH; + } + + case ISL_AUX_USAGE_MCS: + assert(aux_supported); + if (clear_supported) { + return ISL_AUX_STATE_COMPRESSED_CLEAR; + } else { + return ISL_AUX_STATE_COMPRESSED_NO_CLEAR; + } + + case ISL_AUX_USAGE_STC_CCS: + assert(aux_supported); + assert(!clear_supported); + return 
ISL_AUX_STATE_COMPRESSED_NO_CLEAR; + + default: + unreachable("Unsupported aux usage"); + } +} + +/** + * This function determines the optimal buffer to use for a given + * VkImageLayout and other pieces of information needed to make that + * determination. This does not determine the optimal buffer to use + * during a resolve operation. + * + * @param devinfo The device information of the Intel GPU. + * @param image The image that may contain a collection of buffers. + * @param aspect The aspect of the image to be accessed. + * @param usage The usage which describes how the image will be accessed. + * @param layout The current layout of the image aspect(s). + * + * @return The primary buffer that should be used for the given layout. + */ +enum isl_aux_usage ATTRIBUTE_PURE +anv_layout_to_aux_usage(const struct intel_device_info * const devinfo, + const struct anv_image * const image, + const VkImageAspectFlagBits aspect, + const VkImageUsageFlagBits usage, + const VkImageLayout layout) +{ + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + /* If there is no auxiliary surface allocated, we must use the one and only + * main buffer. + */ + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + return ISL_AUX_USAGE_NONE; + + enum isl_aux_state aux_state = + anv_layout_to_aux_state(devinfo, image, aspect, layout); + + switch (aux_state) { + case ISL_AUX_STATE_CLEAR: + unreachable("We never use this state"); + + case ISL_AUX_STATE_PARTIAL_CLEAR: + assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + assert(image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D); + assert(image->vk.samples == 1); + return ISL_AUX_USAGE_CCS_D; + + case ISL_AUX_STATE_COMPRESSED_CLEAR: + case ISL_AUX_STATE_COMPRESSED_NO_CLEAR: + return image->planes[plane].aux_usage; + + case ISL_AUX_STATE_RESOLVED: + /* We can only use RESOLVED in read-only layouts because any write will + * either land us in AUX_INVALID or COMPRESSED_NO_CLEAR. We can do + * writes in PASS_THROUGH without destroying it so that is allowed. + */ + assert(vk_image_layout_is_read_only(layout, aspect)); + assert(util_is_power_of_two_or_zero(usage)); + if (usage == VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) { + /* If we have valid HiZ data and are using the image as a read-only + * depth/stencil attachment, we should enable HiZ so that we can get + * faster depth testing. + */ + return image->planes[plane].aux_usage; + } else { + return ISL_AUX_USAGE_NONE; + } + + case ISL_AUX_STATE_PASS_THROUGH: + case ISL_AUX_STATE_AUX_INVALID: + return ISL_AUX_USAGE_NONE; + } + + unreachable("Invalid isl_aux_state"); +} + +/** + * This function returns the level of unresolved fast-clear support of the + * given image in the given VkImageLayout. + * + * @param devinfo The device information of the Intel GPU. + * @param image The image that may contain a collection of buffers. + * @param aspect The aspect of the image to be accessed. + * @param usage The usage which describes how the image will be accessed. + * @param layout The current layout of the image aspect(s). 
+ */ +enum anv_fast_clear_type ATTRIBUTE_PURE +anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo, + const struct anv_image * const image, + const VkImageAspectFlagBits aspect, + const VkImageLayout layout) +{ + if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR)) + return ANV_FAST_CLEAR_NONE; + + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + /* If there is no auxiliary surface allocated, there are no fast-clears */ + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + return ANV_FAST_CLEAR_NONE; + + /* We don't support MSAA fast-clears on Ivybridge or Bay Trail because they + * lack the MI ALU which we need to determine the predicates. + */ + if (devinfo->verx10 == 70 && image->vk.samples > 1) + return ANV_FAST_CLEAR_NONE; + + enum isl_aux_state aux_state = + anv_layout_to_aux_state(devinfo, image, aspect, layout); + + switch (aux_state) { + case ISL_AUX_STATE_CLEAR: + unreachable("We never use this state"); + + case ISL_AUX_STATE_PARTIAL_CLEAR: + case ISL_AUX_STATE_COMPRESSED_CLEAR: + if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) { + return ANV_FAST_CLEAR_DEFAULT_VALUE; + } else if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { + /* The image might not support non zero fast clears when mutable. */ + if (!image->planes[plane].can_non_zero_fast_clear) + return ANV_FAST_CLEAR_DEFAULT_VALUE; + + /* When we're in a render pass we have the clear color data from the + * VkRenderPassBeginInfo and we can use arbitrary clear colors. They + * must get partially resolved before we leave the render pass. + */ + return ANV_FAST_CLEAR_ANY; + } else if (image->planes[plane].aux_usage == ISL_AUX_USAGE_MCS || + image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) { + if (devinfo->ver >= 11) { + /* The image might not support non zero fast clears when mutable. */ + if (!image->planes[plane].can_non_zero_fast_clear) + return ANV_FAST_CLEAR_DEFAULT_VALUE; + + /* On ICL and later, the sampler hardware uses a copy of the clear + * value that is encoded as a pixel value. Therefore, we can use + * any clear color we like for sampling. + */ + return ANV_FAST_CLEAR_ANY; + } else { + /* If the image has MCS or CCS_E enabled all the time then we can + * use fast-clear as long as the clear color is the default value + * of zero since this is the default value we program into every + * surface state used for texturing. 
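As a condensed view of the color fast-clear selection made here (including the fall-through returns just below), the following standalone sketch uses stand-in parameters in place of the devinfo, plane and layout checks; the enum and names are illustrative only:

    #include <stdbool.h>

    enum sketch_fast_clear {
       SKETCH_FAST_CLEAR_NONE,
       SKETCH_FAST_CLEAR_DEFAULT_VALUE,
       SKETCH_FAST_CLEAR_ANY,
    };

    /* "aux_always_on" stands for the MCS/CCS_E case and "gfx_ver" for
     * devinfo->ver; depth handling is omitted. */
    static enum sketch_fast_clear
    color_fast_clear_type(bool in_color_attachment_layout,
                          bool aux_always_on,
                          bool can_non_zero_fast_clear,
                          int gfx_ver)
    {
       if (in_color_attachment_layout) {
          /* Inside the render pass the clear color comes from
           * VkRenderPassBeginInfo, so any color is usable (it must be
           * partially resolved before the pass ends), unless the image
           * cannot do non-zero fast clears at all. */
          return can_non_zero_fast_clear ? SKETCH_FAST_CLEAR_ANY
                                         : SKETCH_FAST_CLEAR_DEFAULT_VALUE;
       }

       if (aux_always_on) {
          /* Outside the render pass, only ICL+ samplers read a pixel-encoded
           * copy of the clear color; older parts can only fast-clear to the
           * default zero value programmed into every texturing surface
           * state. */
          if (gfx_ver >= 11 && can_non_zero_fast_clear)
             return SKETCH_FAST_CLEAR_ANY;
          return SKETCH_FAST_CLEAR_DEFAULT_VALUE;
       }

       return SKETCH_FAST_CLEAR_NONE;
    }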
+ */ + return ANV_FAST_CLEAR_DEFAULT_VALUE; + } + } else { + return ANV_FAST_CLEAR_NONE; + } + + case ISL_AUX_STATE_COMPRESSED_NO_CLEAR: + case ISL_AUX_STATE_RESOLVED: + case ISL_AUX_STATE_PASS_THROUGH: + case ISL_AUX_STATE_AUX_INVALID: + return ANV_FAST_CLEAR_NONE; + } + + unreachable("Invalid isl_aux_state"); +} + + +static struct anv_state +alloc_surface_state(struct anv_device *device) +{ + return anv_state_pool_alloc(&device->surface_state_pool, 64, 64); +} + +static enum isl_channel_select +remap_swizzle(VkComponentSwizzle swizzle, + struct isl_swizzle format_swizzle) +{ + switch (swizzle) { + case VK_COMPONENT_SWIZZLE_ZERO: return ISL_CHANNEL_SELECT_ZERO; + case VK_COMPONENT_SWIZZLE_ONE: return ISL_CHANNEL_SELECT_ONE; + case VK_COMPONENT_SWIZZLE_R: return format_swizzle.r; + case VK_COMPONENT_SWIZZLE_G: return format_swizzle.g; + case VK_COMPONENT_SWIZZLE_B: return format_swizzle.b; + case VK_COMPONENT_SWIZZLE_A: return format_swizzle.a; + default: + unreachable("Invalid swizzle"); + } +} + +void +anv_image_fill_surface_state(struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + const struct isl_view *view_in, + isl_surf_usage_flags_t view_usage, + enum isl_aux_usage aux_usage, + const union isl_color_value *clear_color, + enum anv_image_view_state_flags flags, + struct anv_surface_state *state_inout, + struct brw_image_param *image_param_out) +{ + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + const struct anv_surface *surface = &image->planes[plane].primary_surface, + *aux_surface = &image->planes[plane].aux_surface; + + struct isl_view view = *view_in; + view.usage |= view_usage; + + /* For texturing with VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL from a + * compressed surface with a shadow surface, we use the shadow instead of + * the primary surface. The shadow surface will be tiled, unlike the main + * surface, so it should get significantly better performance. + */ + if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && + isl_format_is_compressed(view.format) && + (flags & ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL)) { + assert(isl_format_is_compressed(surface->isl.format)); + assert(surface->isl.tiling == ISL_TILING_LINEAR); + assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR); + surface = &image->planes[plane].shadow_surface; + } + + /* For texturing from stencil on gfx7, we have to sample from a shadow + * surface because we don't support W-tiling in the sampler. + */ + if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && + aspect == VK_IMAGE_ASPECT_STENCIL_BIT) { + assert(device->info->ver == 7); + assert(view_usage & ISL_SURF_USAGE_TEXTURE_BIT); + surface = &image->planes[plane].shadow_surface; + } + + if (view_usage == ISL_SURF_USAGE_RENDER_TARGET_BIT) + view.swizzle = anv_swizzle_for_render(view.swizzle); + + /* On Ivy Bridge and Bay Trail we do the swizzle in the shader */ + if (device->info->verx10 == 70) + view.swizzle = ISL_SWIZZLE_IDENTITY; + + /* If this is a HiZ buffer we can sample from with a programmable clear + * value (SKL+), define the clear value to the optimal constant. 
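The shadow-surface selection at the top of anv_image_fill_surface_state() reduces to two rules; a minimal sketch with booleans standing in for the anv_surface/isl_format checks (names are illustrative, not the real API):

    #include <stdbool.h>

    enum sketch_bound_surface { SKETCH_BIND_PRIMARY, SKETCH_BIND_SHADOW };

    static enum sketch_bound_surface
    surface_for_texturing(bool has_shadow, bool compressed_texture_optimal,
                          bool is_stencil, int gfx_ver)
    {
       /* A compressed, linear primary surface gets a tiled shadow copy that
        * is much faster to sample, so texture-optimal views use the shadow. */
       if (has_shadow && compressed_texture_optimal)
          return SKETCH_BIND_SHADOW;

       /* The gfx7 sampler cannot read W-tiled stencil, so stencil texturing
        * also goes through the shadow copy. */
       if (has_shadow && is_stencil && gfx_ver == 7)
          return SKETCH_BIND_SHADOW;

       return SKETCH_BIND_PRIMARY;
    }

Either way the isl_view stays the same; only the surface actually programmed into SURFACE_STATE changes.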
+ */ + union isl_color_value default_clear_color = { .u32 = { 0, } }; + if (device->info->ver >= 9 && aspect == VK_IMAGE_ASPECT_DEPTH_BIT) + default_clear_color.f32[0] = ANV_HZ_FC_VAL; + if (!clear_color) + clear_color = &default_clear_color; + + const struct anv_address address = + anv_image_address(image, &surface->memory_range); + + if (view_usage == ISL_SURF_USAGE_STORAGE_BIT && + (flags & ANV_IMAGE_VIEW_STATE_STORAGE_LOWERED) && + !isl_has_matching_typed_storage_image_format(device->info, + view.format)) { + /* In this case, we are a writeable storage buffer which needs to be + * lowered to linear. All tiling and offset calculations will be done in + * the shader. + */ + assert(aux_usage == ISL_AUX_USAGE_NONE); + isl_buffer_fill_state(&device->isl_dev, state_inout->state.map, + .address = anv_address_physical(address), + .size_B = surface->isl.size_B, + .format = ISL_FORMAT_RAW, + .swizzle = ISL_SWIZZLE_IDENTITY, + .stride_B = 1, + .mocs = anv_mocs(device, address.bo, view_usage)); + state_inout->address = address, + state_inout->aux_address = ANV_NULL_ADDRESS; + state_inout->clear_address = ANV_NULL_ADDRESS; + } else { + if (view_usage == ISL_SURF_USAGE_STORAGE_BIT && + (flags & ANV_IMAGE_VIEW_STATE_STORAGE_LOWERED)) { + /* Typed surface reads support a very limited subset of the shader + * image formats. Translate it into the closest format the hardware + * supports. + */ + enum isl_format lower_format = + isl_lower_storage_image_format(device->info, view.format); + if (aux_usage != ISL_AUX_USAGE_NONE) { + assert(device->info->verx10 >= 125); + assert(aux_usage == ISL_AUX_USAGE_CCS_E); + assert(isl_formats_are_ccs_e_compatible(device->info, + view.format, + lower_format)); + } + + /* If we lower the format, we should ensure either they both match in + * bits per channel or that there is no swizzle, because we can't use + * the swizzle for a different bit pattern. + */ + assert(isl_formats_have_same_bits_per_channel(lower_format, + view.format) || + isl_swizzle_is_identity_for_format(view.format, view.swizzle)); + + view.format = lower_format; + } + + const struct isl_surf *isl_surf = &surface->isl; + + struct isl_surf tmp_surf; + uint64_t offset_B = 0; + uint32_t tile_x_sa = 0, tile_y_sa = 0; + if (isl_format_is_compressed(surface->isl.format) && + !isl_format_is_compressed(view.format)) { + /* We're creating an uncompressed view of a compressed surface. This + * is allowed but only for a single level/layer. 
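Stepping back, the storage-image handling above picks one of three binding strategies; a sketch of that decision with stand-in predicates in place of the isl format queries:

    #include <stdbool.h>

    enum sketch_storage_binding {
       SKETCH_STORAGE_TYPED,          /* hardware handles the format as-is */
       SKETCH_STORAGE_LOWERED_TYPED,  /* closest typed format the HW supports */
       SKETCH_STORAGE_RAW_BUFFER,     /* bind as RAW, shader does the math */
    };

    static enum sketch_storage_binding
    pick_storage_binding(bool lowering_requested, bool has_matching_typed_format)
    {
       if (!lowering_requested)
          return SKETCH_STORAGE_TYPED;

       if (has_matching_typed_format)
          return SKETCH_STORAGE_LOWERED_TYPED;

       /* No usable typed format: describe the memory as a RAW buffer with a
        * 1-byte stride and let the shader compute tiling/offsets itself. */
       return SKETCH_STORAGE_RAW_BUFFER;
    }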
+ */ + assert(surface->isl.samples == 1); + assert(view.levels == 1); + assert(view.array_len == 1); + + ASSERTED bool ok = + isl_surf_get_uncompressed_surf(&device->isl_dev, isl_surf, &view, + &tmp_surf, &view, + &offset_B, &tile_x_sa, &tile_y_sa); + assert(ok); + isl_surf = &tmp_surf; + + if (device->info->ver <= 8) { + assert(surface->isl.tiling == ISL_TILING_LINEAR); + assert(tile_x_sa == 0); + assert(tile_y_sa == 0); + } + } + + state_inout->address = anv_address_add(address, offset_B); + + struct anv_address aux_address = ANV_NULL_ADDRESS; + if (aux_usage != ISL_AUX_USAGE_NONE) + aux_address = anv_image_address(image, &aux_surface->memory_range); + state_inout->aux_address = aux_address; + + struct anv_address clear_address = ANV_NULL_ADDRESS; + if (device->info->ver >= 10 && isl_aux_usage_has_fast_clears(aux_usage)) { + clear_address = anv_image_get_clear_color_addr(device, image, aspect); + } + state_inout->clear_address = clear_address; + + isl_surf_fill_state(&device->isl_dev, state_inout->state.map, + .surf = isl_surf, + .view = &view, + .address = anv_address_physical(state_inout->address), + .clear_color = *clear_color, + .aux_surf = &aux_surface->isl, + .aux_usage = aux_usage, + .aux_address = anv_address_physical(aux_address), + .clear_address = anv_address_physical(clear_address), + .use_clear_address = !anv_address_is_null(clear_address), + .mocs = anv_mocs(device, state_inout->address.bo, + view_usage), + .x_offset_sa = tile_x_sa, + .y_offset_sa = tile_y_sa); + + /* With the exception of gfx8, the bottom 12 bits of the MCS base address + * are used to store other information. This should be ok, however, + * because the surface buffer addresses are always 4K page aligned. + */ + if (!anv_address_is_null(aux_address)) { + uint32_t *aux_addr_dw = state_inout->state.map + + device->isl_dev.ss.aux_addr_offset; + assert((aux_address.offset & 0xfff) == 0); + state_inout->aux_address.offset |= *aux_addr_dw & 0xfff; + } + + if (device->info->ver >= 10 && clear_address.bo) { + uint32_t *clear_addr_dw = state_inout->state.map + + device->isl_dev.ss.clear_color_state_offset; + assert((clear_address.offset & 0x3f) == 0); + state_inout->clear_address.offset |= *clear_addr_dw & 0x3f; + } + } + + if (image_param_out) { + assert(view_usage == ISL_SURF_USAGE_STORAGE_BIT); + isl_surf_fill_image_param(&device->isl_dev, image_param_out, + &surface->isl, &view); + } +} + +static uint32_t +anv_image_aspect_get_planes(VkImageAspectFlags aspect_mask) +{ + anv_assert_valid_aspect_set(aspect_mask); + return util_bitcount(aspect_mask); +} + +VkResult +anv_CreateImageView(VkDevice _device, + const VkImageViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkImageView *pView) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_image, image, pCreateInfo->image); + struct anv_image_view *iview; + + iview = vk_image_view_create(&device->vk, false, pCreateInfo, + pAllocator, sizeof(*iview)); + if (iview == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + iview->image = image; + iview->n_planes = anv_image_aspect_get_planes(iview->vk.aspects); + + /* Check if a conversion info was passed. 
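Referring back to the aux/clear address fix-ups in anv_image_fill_surface_state() above: because the buffers are 4K aligned, re-combining the buffer offset with the extra bits isl packed into the low bits of the address dword is a plain OR. A standalone sketch (stand-in parameters; the clear-color address does the same with its low 6 bits):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    merge_aux_address(uint64_t aux_buffer_offset, uint32_t aux_addr_dw)
    {
       /* Surface buffers are 4K page aligned, so bits 11:0 are free. */
       assert((aux_buffer_offset & 0xfff) == 0);

       /* Keep whatever isl stored in the low 12 bits of the dword. */
       return aux_buffer_offset | (aux_addr_dw & 0xfff);
    }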
*/ + const struct anv_format *conv_format = NULL; + const VkSamplerYcbcrConversionInfo *conv_info = + vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO); + +#ifdef ANDROID + /* If image has an external format, the pNext chain must contain an + * instance of VKSamplerYcbcrConversionInfo with a conversion object + * created with the same external format as image." + */ + assert(!image->vk.android_external_format || conv_info); +#endif + + if (conv_info) { + ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion, conv_info->conversion); + conv_format = conversion->format; + } + +#ifdef ANDROID + /* "If image has an external format, format must be VK_FORMAT_UNDEFINED." */ + assert(!image->vk.android_external_format || + pCreateInfo->format == VK_FORMAT_UNDEFINED); +#endif + + /* Format is undefined, this can happen when using external formats. Set + * view format from the passed conversion info. + */ + if (iview->vk.view_format == VK_FORMAT_UNDEFINED && conv_format) + iview->vk.view_format = conv_format->vk_format; + + /* Now go through the underlying image selected planes and map them to + * planes in the image view. + */ + anv_foreach_image_aspect_bit(iaspect_bit, image, iview->vk.aspects) { + const uint32_t iplane = + anv_aspect_to_plane(image->vk.aspects, 1UL << iaspect_bit); + const uint32_t vplane = + anv_aspect_to_plane(iview->vk.aspects, 1UL << iaspect_bit); + struct anv_format_plane format; + format = anv_get_format_plane(device->info, iview->vk.view_format, + vplane, image->vk.tiling); + + iview->planes[vplane].image_plane = iplane; + + iview->planes[vplane].isl = (struct isl_view) { + .format = format.isl_format, + .base_level = iview->vk.base_mip_level, + .levels = iview->vk.level_count, + .base_array_layer = iview->vk.base_array_layer, + .array_len = iview->vk.layer_count, + .min_lod_clamp = iview->vk.min_lod, + .swizzle = { + .r = remap_swizzle(iview->vk.swizzle.r, format.swizzle), + .g = remap_swizzle(iview->vk.swizzle.g, format.swizzle), + .b = remap_swizzle(iview->vk.swizzle.b, format.swizzle), + .a = remap_swizzle(iview->vk.swizzle.a, format.swizzle), + }, + }; + + if (pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_3D) { + iview->planes[vplane].isl.base_array_layer = 0; + iview->planes[vplane].isl.array_len = iview->vk.extent.depth; + } + + if (pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_CUBE || + pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) { + iview->planes[vplane].isl.usage = ISL_SURF_USAGE_CUBE_BIT; + } else { + iview->planes[vplane].isl.usage = 0; + } + + if (iview->vk.usage & (VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) { + iview->planes[vplane].optimal_sampler_surface_state.state = alloc_surface_state(device); + iview->planes[vplane].general_sampler_surface_state.state = alloc_surface_state(device); + + enum isl_aux_usage general_aux_usage = + anv_layout_to_aux_usage(device->info, image, 1UL << iaspect_bit, + VK_IMAGE_USAGE_SAMPLED_BIT, + VK_IMAGE_LAYOUT_GENERAL); + enum isl_aux_usage optimal_aux_usage = + anv_layout_to_aux_usage(device->info, image, 1UL << iaspect_bit, + VK_IMAGE_USAGE_SAMPLED_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); + + anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit, + &iview->planes[vplane].isl, + ISL_SURF_USAGE_TEXTURE_BIT, + optimal_aux_usage, NULL, + ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL, + &iview->planes[vplane].optimal_sampler_surface_state, + NULL); + + anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit, + &iview->planes[vplane].isl, + 
ISL_SURF_USAGE_TEXTURE_BIT, + general_aux_usage, NULL, + 0, + &iview->planes[vplane].general_sampler_surface_state, + NULL); + } + + /* NOTE: This one needs to go last since it may stomp isl_view.format */ + if (iview->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT) { + enum isl_aux_usage general_aux_usage = + anv_layout_to_aux_usage(device->info, image, 1UL << iaspect_bit, + VK_IMAGE_USAGE_STORAGE_BIT, + VK_IMAGE_LAYOUT_GENERAL); + iview->planes[vplane].storage_surface_state.state = alloc_surface_state(device); + anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit, + &iview->planes[vplane].isl, + ISL_SURF_USAGE_STORAGE_BIT, + general_aux_usage, NULL, + 0, + &iview->planes[vplane].storage_surface_state, + NULL); + + if (isl_is_storage_image_format(format.isl_format)) { + iview->planes[vplane].lowered_storage_surface_state.state = + alloc_surface_state(device); + + anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit, + &iview->planes[vplane].isl, + ISL_SURF_USAGE_STORAGE_BIT, + general_aux_usage, NULL, + ANV_IMAGE_VIEW_STATE_STORAGE_LOWERED, + &iview->planes[vplane].lowered_storage_surface_state, + device->info->ver >= 9 ? NULL : + &iview->planes[vplane].lowered_storage_image_param); + } else { + /* In this case, we support the format but, because there's no + * SPIR-V format specifier corresponding to it, we only support it + * if the hardware can do it natively. This is possible for some + * reads but for most writes. Instead of hanging if someone gets + * it wrong, we give them a NULL descriptor. + */ + assert(isl_format_supports_typed_writes(device->info, + format.isl_format)); + iview->planes[vplane].lowered_storage_surface_state.state = + device->null_surface_state; + } + } + } + + *pView = anv_image_view_to_handle(iview); + + return VK_SUCCESS; +} + +void +anv_DestroyImageView(VkDevice _device, VkImageView _iview, + const VkAllocationCallbacks *pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_image_view, iview, _iview); + + if (!iview) + return; + + for (uint32_t plane = 0; plane < iview->n_planes; plane++) { + /* Check offset instead of alloc_size because this they might be + * device->null_surface_state which always has offset == 0. We don't + * own that one so we don't want to accidentally free it. 
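The ownership rule described just above comes down to a one-line test; sketched here with a stand-in struct rather than struct anv_state:

    #include <stdbool.h>
    #include <stdint.h>

    struct sketch_state { uint32_t offset; uint32_t alloc_size; };

    /* Pool-allocated surface states come back with a non-zero offset, while
     * the shared device->null_surface_state lives at offset 0 and belongs to
     * the device, so the offset doubles as an "is this ours to free?" flag. */
    static bool
    owns_surface_state(struct sketch_state s)
    {
       return s.offset != 0;
    }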
+ */ + if (iview->planes[plane].optimal_sampler_surface_state.state.offset) { + anv_state_pool_free(&device->surface_state_pool, + iview->planes[plane].optimal_sampler_surface_state.state); + } + + if (iview->planes[plane].general_sampler_surface_state.state.offset) { + anv_state_pool_free(&device->surface_state_pool, + iview->planes[plane].general_sampler_surface_state.state); + } + + if (iview->planes[plane].storage_surface_state.state.offset) { + anv_state_pool_free(&device->surface_state_pool, + iview->planes[plane].storage_surface_state.state); + } + + if (iview->planes[plane].lowered_storage_surface_state.state.offset) { + anv_state_pool_free(&device->surface_state_pool, + iview->planes[plane].lowered_storage_surface_state.state); + } + } + + vk_image_view_destroy(&device->vk, pAllocator, &iview->vk); +} + + +VkResult +anv_CreateBufferView(VkDevice _device, + const VkBufferViewCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkBufferView *pView) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer); + struct anv_buffer_view *view; + + view = vk_object_alloc(&device->vk, pAllocator, sizeof(*view), + VK_OBJECT_TYPE_BUFFER_VIEW); + if (!view) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct anv_format_plane format; + format = anv_get_format_plane(device->info, pCreateInfo->format, + 0, VK_IMAGE_TILING_LINEAR); + + const uint32_t format_bs = isl_format_get_layout(format.isl_format)->bpb / 8; + view->range = vk_buffer_range(&buffer->vk, pCreateInfo->offset, + pCreateInfo->range); + view->range = align_down_npot_u32(view->range, format_bs); + + view->address = anv_address_add(buffer->address, pCreateInfo->offset); + + if (buffer->vk.usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT) { + view->surface_state = alloc_surface_state(device); + + anv_fill_buffer_surface_state(device, view->surface_state, + format.isl_format, format.swizzle, + ISL_SURF_USAGE_TEXTURE_BIT, + view->address, view->range, format_bs); + } else { + view->surface_state = (struct anv_state){ 0 }; + } + + if (buffer->vk.usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) { + view->storage_surface_state = alloc_surface_state(device); + view->lowered_storage_surface_state = alloc_surface_state(device); + + anv_fill_buffer_surface_state(device, view->storage_surface_state, + format.isl_format, format.swizzle, + ISL_SURF_USAGE_STORAGE_BIT, + view->address, view->range, format_bs); + + enum isl_format lowered_format = + isl_has_matching_typed_storage_image_format(device->info, + format.isl_format) ? + isl_lower_storage_image_format(device->info, format.isl_format) : + ISL_FORMAT_RAW; + + /* If we lower the format, we should ensure either they both match in + * bits per channel or that there is no swizzle because we can't use + * the swizzle for a different bit pattern. + */ + assert(isl_formats_have_same_bits_per_channel(lowered_format, + format.isl_format) || + isl_swizzle_is_identity(format.swizzle)); + + anv_fill_buffer_surface_state(device, view->lowered_storage_surface_state, + lowered_format, format.swizzle, + ISL_SURF_USAGE_STORAGE_BIT, + view->address, view->range, + (lowered_format == ISL_FORMAT_RAW ? 
1 : + isl_format_get_layout(lowered_format)->bpb / 8)); + + isl_buffer_fill_image_param(&device->isl_dev, + &view->lowered_storage_image_param, + format.isl_format, view->range); + } else { + view->storage_surface_state = (struct anv_state){ 0 }; + view->lowered_storage_surface_state = (struct anv_state){ 0 }; + } + + *pView = anv_buffer_view_to_handle(view); + + return VK_SUCCESS; +} + +void +anv_DestroyBufferView(VkDevice _device, VkBufferView bufferView, + const VkAllocationCallbacks *pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_buffer_view, view, bufferView); + + if (!view) + return; + + if (view->surface_state.alloc_size > 0) + anv_state_pool_free(&device->surface_state_pool, + view->surface_state); + + if (view->storage_surface_state.alloc_size > 0) + anv_state_pool_free(&device->surface_state_pool, + view->storage_surface_state); + + if (view->lowered_storage_surface_state.alloc_size > 0) + anv_state_pool_free(&device->surface_state_pool, + view->lowered_storage_surface_state); + + vk_object_free(&device->vk, pAllocator, view); +} diff --git a/src/intel/vulkan_hasvk/anv_measure.c b/src/intel/vulkan_hasvk/anv_measure.c new file mode 100644 index 00000000000..f1e4d0eeba9 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_measure.c @@ -0,0 +1,516 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
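Looking back at anv_CreateBufferView() above: the view range is resolved against the buffer and then rounded down to a whole number of texels, since texel sizes need not be powers of two (RGB32 formats are 12 bytes, for instance). A standalone sketch of that computation (hypothetical helper; VK_WHOLE_SIZE spelled out as UINT64_MAX):

    #include <stdint.h>

    static uint64_t
    texel_view_range(uint64_t buffer_size, uint64_t view_offset,
                     uint64_t requested_range, uint32_t texel_size_B)
    {
       /* VK_WHOLE_SIZE (~0ull) means "to the end of the buffer". */
       uint64_t range = (requested_range == UINT64_MAX)
                      ? buffer_size - view_offset
                      : requested_range;

       /* align_down_npot: drop any trailing partial texel. */
       return range - (range % texel_size_B);
    }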
+ */ + +#include "anv_measure.h" + +#include +#include +#include + +#include "common/intel_measure.h" +#include "util/debug.h" + +struct anv_measure_batch { + struct anv_bo *bo; + struct intel_measure_batch base; +}; + +void +anv_measure_device_init(struct anv_physical_device *device) +{ + switch (device->info.verx10) { + case 125: + device->cmd_emit_timestamp = &gfx125_cmd_emit_timestamp; + break; + case 120: + device->cmd_emit_timestamp = &gfx12_cmd_emit_timestamp; + break; + case 110: + device->cmd_emit_timestamp = &gfx11_cmd_emit_timestamp; + break; + case 90: + device->cmd_emit_timestamp = &gfx9_cmd_emit_timestamp; + break; + case 80: + device->cmd_emit_timestamp = &gfx8_cmd_emit_timestamp; + break; + case 75: + device->cmd_emit_timestamp = &gfx75_cmd_emit_timestamp; + break; + case 70: + device->cmd_emit_timestamp = &gfx7_cmd_emit_timestamp; + break; + default: + assert(false); + } + + /* initialise list of measure structures that await rendering */ + struct intel_measure_device *measure_device = &device->measure_device; + intel_measure_init(measure_device); + struct intel_measure_config *config = measure_device->config; + if (config == NULL) + return; + + /* the final member of intel_measure_ringbuffer is a zero-length array of + * intel_measure_buffered_result objects. Allocate additional space for + * the buffered objects based on the run-time configurable buffer_size + */ + const size_t rb_bytes = sizeof(struct intel_measure_ringbuffer) + + config->buffer_size * sizeof(struct intel_measure_buffered_result); + struct intel_measure_ringbuffer * rb = + vk_zalloc(&device->instance->vk.alloc, + rb_bytes, 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + measure_device->ringbuffer = rb; +} + +static struct intel_measure_config* +config_from_command_buffer(struct anv_cmd_buffer *cmd_buffer) +{ + return cmd_buffer->device->physical->measure_device.config; +} + +void +anv_measure_init(struct anv_cmd_buffer *cmd_buffer) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_device *device = cmd_buffer->device; + + if (!config || !config->enabled) { + cmd_buffer->measure = NULL; + return; + } + + /* the final member of anv_measure is a zero-length array of + * intel_measure_snapshot objects. 
Create additional space for the + * snapshot objects based on the run-time configurable batch_size + */ + const size_t batch_bytes = sizeof(struct anv_measure_batch) + + config->batch_size * sizeof(struct intel_measure_snapshot); + struct anv_measure_batch * measure = + vk_alloc(&cmd_buffer->vk.pool->alloc, + batch_bytes, 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + memset(measure, 0, batch_bytes); + ASSERTED VkResult result = + anv_device_alloc_bo(device, "measure data", + config->batch_size * sizeof(uint64_t), + ANV_BO_ALLOC_MAPPED, + 0, + (struct anv_bo**)&measure->bo); + measure->base.timestamps = measure->bo->map; + assert(result == VK_SUCCESS); + + cmd_buffer->measure = measure; +} + +static void +anv_measure_start_snapshot(struct anv_cmd_buffer *cmd_buffer, + enum intel_measure_snapshot_type type, + const char *event_name, + uint32_t count) +{ + struct anv_batch *batch = &cmd_buffer->batch; + struct anv_measure_batch *measure = cmd_buffer->measure; + struct anv_physical_device *device = cmd_buffer->device->physical; + struct intel_measure_device *measure_device = &device->measure_device; + + const unsigned device_frame = measure_device->frame; + + /* if the command buffer is not associated with a frame, associate it with + * the most recent acquired frame + */ + if (measure->base.frame == 0) + measure->base.frame = device_frame; + +// uintptr_t framebuffer = (uintptr_t)cmd_buffer->state.framebuffer; +// +// if (!measure->base.framebuffer && +// cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) +// /* secondary command buffer inherited the framebuffer from the primary */ +// measure->base.framebuffer = framebuffer; +// +// /* verify framebuffer has been properly tracked */ +// assert(type == INTEL_SNAPSHOT_END || +// framebuffer == measure->base.framebuffer || +// framebuffer == 0 ); /* compute has no framebuffer */ + + unsigned index = measure->base.index++; + + (*device->cmd_emit_timestamp)(batch, cmd_buffer->device, + (struct anv_address) { + .bo = measure->bo, + .offset = index * sizeof(uint64_t) }, + true /* end_of_pipe */); + + if (event_name == NULL) + event_name = intel_measure_snapshot_string(type); + + struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]); + memset(snapshot, 0, sizeof(*snapshot)); + snapshot->type = type; + snapshot->count = (unsigned) count; + snapshot->event_count = measure->base.event_count; + snapshot->event_name = event_name; +// snapshot->framebuffer = framebuffer; + + if (type == INTEL_SNAPSHOT_COMPUTE && cmd_buffer->state.compute.pipeline) { + snapshot->cs = (uintptr_t) cmd_buffer->state.compute.pipeline->cs; + } else if (cmd_buffer->state.gfx.pipeline) { + const struct anv_graphics_pipeline *pipeline = + cmd_buffer->state.gfx.pipeline; + snapshot->vs = (uintptr_t) pipeline->shaders[MESA_SHADER_VERTEX]; + snapshot->tcs = (uintptr_t) pipeline->shaders[MESA_SHADER_TESS_CTRL]; + snapshot->tes = (uintptr_t) pipeline->shaders[MESA_SHADER_TESS_EVAL]; + snapshot->gs = (uintptr_t) pipeline->shaders[MESA_SHADER_GEOMETRY]; + snapshot->fs = (uintptr_t) pipeline->shaders[MESA_SHADER_FRAGMENT]; + } +} + +static void +anv_measure_end_snapshot(struct anv_cmd_buffer *cmd_buffer, + uint32_t event_count) +{ + struct anv_batch *batch = &cmd_buffer->batch; + struct anv_measure_batch *measure = cmd_buffer->measure; + struct anv_physical_device *device = cmd_buffer->device->physical; + + unsigned index = measure->base.index++; + assert(index % 2 == 1); + + (*device->cmd_emit_timestamp)(batch, cmd_buffer->device, + (struct anv_address) { + 
.bo = measure->bo, + .offset = index * sizeof(uint64_t) }, + true /* end_of_pipe */); + + struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]); + memset(snapshot, 0, sizeof(*snapshot)); + snapshot->type = INTEL_SNAPSHOT_END; + snapshot->event_count = event_count; +} + +static bool +state_changed(struct anv_cmd_buffer *cmd_buffer, + enum intel_measure_snapshot_type type) +{ + uintptr_t vs=0, tcs=0, tes=0, gs=0, fs=0, cs=0; + + if (cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) + /* can't record timestamps in this mode */ + return false; + + if (type == INTEL_SNAPSHOT_COMPUTE) { + const struct anv_compute_pipeline *cs_pipe = + cmd_buffer->state.compute.pipeline; + assert(cs_pipe); + cs = (uintptr_t)cs_pipe->cs; + } else if (type == INTEL_SNAPSHOT_DRAW) { + const struct anv_graphics_pipeline *gfx = cmd_buffer->state.gfx.pipeline; + assert(gfx); + vs = (uintptr_t) gfx->shaders[MESA_SHADER_VERTEX]; + tcs = (uintptr_t) gfx->shaders[MESA_SHADER_TESS_CTRL]; + tes = (uintptr_t) gfx->shaders[MESA_SHADER_TESS_EVAL]; + gs = (uintptr_t) gfx->shaders[MESA_SHADER_GEOMETRY]; + fs = (uintptr_t) gfx->shaders[MESA_SHADER_FRAGMENT]; + } + /* else blorp, all programs NULL */ + + return intel_measure_state_changed(&cmd_buffer->measure->base, + vs, tcs, tes, gs, fs, cs); +} + +void +_anv_measure_snapshot(struct anv_cmd_buffer *cmd_buffer, + enum intel_measure_snapshot_type type, + const char *event_name, + uint32_t count) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_measure_batch *measure = cmd_buffer->measure; + + assert(config); + if (measure == NULL) + return; + + assert(type != INTEL_SNAPSHOT_END); + if (!state_changed(cmd_buffer, type)) { + /* filter out this event */ + return; + } + + /* increment event count */ + ++measure->base.event_count; + if (measure->base.event_count == 1 || + measure->base.event_count == config->event_interval + 1) { + /* the first event of an interval */ + + if (measure->base.index % 2) { + /* end the previous event */ + anv_measure_end_snapshot(cmd_buffer, measure->base.event_count - 1); + } + measure->base.event_count = 1; + + if (measure->base.index == config->batch_size) { + /* Snapshot buffer is full. The batch must be flushed before + * additional snapshots can be taken. + */ + static bool warned = false; + if (unlikely(!warned)) { + fprintf(config->file, + "WARNING: batch size exceeds INTEL_MEASURE limit: %d. " + "Data has been dropped. " + "Increase setting with INTEL_MEASURE=batch_size={count}\n", + config->batch_size); + } + + warned = true; + return; + } + + anv_measure_start_snapshot(cmd_buffer, type, event_name, count); + } +} + +/** + * Called when a command buffer is reset. Re-initializes existing anv_measure + * data structures. + */ +void +anv_measure_reset(struct anv_cmd_buffer *cmd_buffer) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_device *device = cmd_buffer->device; + struct anv_measure_batch *measure = cmd_buffer->measure; + + if (!config) + return; + + if (!config->enabled) { + cmd_buffer->measure = NULL; + return; + } + + if (!measure) { + /* Capture has recently been enabled. Instead of resetting, a new data + * structure must be allocated and initialized. 
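A CPU-side sketch of the snapshot pairing and interval logic in _anv_measure_snapshot() above; only the index bookkeeping is modelled, the real code emits a GPU timestamp at each start/end step:

    struct sketch_measure {
       unsigned index;        /* next timestamp slot; odd = a start is open */
       unsigned event_count;  /* events since the current snapshot started */
    };

    static void
    on_event(struct sketch_measure *m, unsigned event_interval,
             unsigned batch_size)
    {
       m->event_count++;

       /* Only the first event of an interval starts a new snapshot. */
       if (m->event_count != 1 && m->event_count != event_interval + 1)
          return;

       if (m->index % 2)
          m->index++;              /* close the previous start/end pair */
       m->event_count = 1;

       if (m->index >= batch_size)
          return;                  /* snapshot buffer full: data is dropped */

       m->index++;                 /* open a new snapshot */
    }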
+ */ + return anv_measure_init(cmd_buffer); + } + + /* it is possible that the command buffer contains snapshots that have not + * yet been processed + */ + intel_measure_gather(&device->physical->measure_device, + device->info); + + assert(cmd_buffer->device != NULL); + + measure->base.index = 0; +// measure->base.framebuffer = 0; + measure->base.frame = 0; + measure->base.event_count = 0; + list_inithead(&measure->base.link); +} + +void +anv_measure_destroy(struct anv_cmd_buffer *cmd_buffer) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_measure_batch *measure = cmd_buffer->measure; + struct anv_device *device = cmd_buffer->device; + struct anv_physical_device *physical = device->physical; + + if (!config) + return; + if (measure == NULL) + return; + + /* it is possible that the command buffer contains snapshots that have not + * yet been processed + */ + intel_measure_gather(&physical->measure_device, &physical->info); + + anv_device_release_bo(device, measure->bo); + vk_free(&cmd_buffer->vk.pool->alloc, measure); + cmd_buffer->measure = NULL; +} + +static struct intel_measure_config* +config_from_device(struct anv_device *device) +{ + return device->physical->measure_device.config; +} + +void +anv_measure_device_destroy(struct anv_physical_device *device) +{ + struct intel_measure_device *measure_device = &device->measure_device; + struct intel_measure_config *config = measure_device->config; + + if (!config) + return; + + if (measure_device->ringbuffer != NULL) { + vk_free(&device->instance->vk.alloc, measure_device->ringbuffer); + measure_device->ringbuffer = NULL; + } +} + +/** + * Hook for command buffer submission. + */ +void +_anv_measure_submit(struct anv_cmd_buffer *cmd_buffer) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_measure_batch *measure = cmd_buffer->measure; + struct intel_measure_device *measure_device = &cmd_buffer->device->physical->measure_device; + + if (!config) + return; + if (measure == NULL) + return; + + struct intel_measure_batch *base = &measure->base; + if (base->index == 0) + /* no snapshots were started */ + return; + + /* finalize snapshots and enqueue them */ + static unsigned cmd_buffer_count = 0; + base->batch_count = p_atomic_inc_return(&cmd_buffer_count); + + if (base->index %2 == 1) { + anv_measure_end_snapshot(cmd_buffer, base->event_count); + base->event_count = 0; + } + + /* Mark the final timestamp as 'not completed'. This marker will be used + * to verify that rendering is complete. + */ + base->timestamps[base->index - 1] = 0; + + /* add to the list of submitted snapshots */ + pthread_mutex_lock(&measure_device->mutex); + list_addtail(&measure->base.link, &measure_device->queued_snapshots); + pthread_mutex_unlock(&measure_device->mutex); +} + +/** + * Hook for the start of a frame. 
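The "not completed" marker written in _anv_measure_submit() above implies a simple readiness test when results are gathered later; a hedged sketch (stand-in signature, not the intel_measure API):

    #include <stdbool.h>
    #include <stdint.h>

    /* The last slot is zeroed before submission and overwritten by the GPU
     * when the batch finishes, so a non-zero value there means every
     * snapshot in the batch can be read back. */
    static bool
    batch_results_ready(const volatile uint64_t *timestamps,
                        unsigned num_timestamps)
    {
       return num_timestamps > 0 && timestamps[num_timestamps - 1] != 0;
    }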
+ */ +void +_anv_measure_acquire(struct anv_device *device) +{ + struct intel_measure_config *config = config_from_device(device); + struct intel_measure_device *measure_device = &device->physical->measure_device; + + if (!config) + return; + if (measure_device == NULL) + return; + + intel_measure_frame_transition(p_atomic_inc_return(&measure_device->frame)); + + /* iterate the queued snapshots and publish those that finished */ + intel_measure_gather(measure_device, &device->physical->info); +} + +void +_anv_measure_endcommandbuffer(struct anv_cmd_buffer *cmd_buffer) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_measure_batch *measure = cmd_buffer->measure; + + if (!config) + return; + if (measure == NULL) + return; + if (measure->base.index % 2 == 0) + return; + + anv_measure_end_snapshot(cmd_buffer, measure->base.event_count); + measure->base.event_count = 0; +} + +void +_anv_measure_beginrenderpass(struct anv_cmd_buffer *cmd_buffer) +{ + struct intel_measure_config *config = config_from_command_buffer(cmd_buffer); + struct anv_measure_batch *measure = cmd_buffer->measure; + + if (!config) + return; + if (measure == NULL) + return; + +// if (measure->base.framebuffer == (uintptr_t) cmd_buffer->state.framebuffer) +// /* no change */ +// return; + + bool filtering = (config->flags & (INTEL_MEASURE_RENDERPASS | + INTEL_MEASURE_SHADER)); + if (filtering && measure->base.index % 2 == 1) { + /* snapshot for previous renderpass was not ended */ + anv_measure_end_snapshot(cmd_buffer, + measure->base.event_count); + measure->base.event_count = 0; + } + +// measure->base.framebuffer = (uintptr_t) cmd_buffer->state.framebuffer; +} + +void +_anv_measure_add_secondary(struct anv_cmd_buffer *primary, + struct anv_cmd_buffer *secondary) +{ + struct intel_measure_config *config = config_from_command_buffer(primary); + struct anv_measure_batch *measure = primary->measure; + if (!config) + return; + if (measure == NULL) + return; + if (config->flags & (INTEL_MEASURE_BATCH | INTEL_MEASURE_FRAME)) + /* secondary timing will be contained within the primary */ + return; + if (secondary->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) { + static bool warned = false; + if (unlikely(!warned)) { + fprintf(config->file, + "WARNING: INTEL_MEASURE cannot capture timings of commands " + "in secondary command buffers with " + "VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT set.\n"); + } + return; + } + + if (measure->base.index % 2 == 1) + anv_measure_end_snapshot(primary, measure->base.event_count); + + struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[measure->base.index]); + _anv_measure_snapshot(primary, INTEL_SNAPSHOT_SECONDARY_BATCH, NULL, 0); + + snapshot->secondary = &secondary->measure->base; +} diff --git a/src/intel/vulkan_hasvk/anv_measure.h b/src/intel/vulkan_hasvk/anv_measure.h new file mode 100644 index 00000000000..a058a5ac51e --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_measure.h @@ -0,0 +1,82 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright 
notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ANV_MEASURE_H +#define ANV_MEASURE_H + +#include "anv_private.h" +#include "common/intel_measure.h" + +void anv_measure_device_init(struct anv_physical_device *device); +void anv_measure_device_destroy(struct anv_physical_device *device); + +void anv_measure_init(struct anv_cmd_buffer *cmd_buffer); +void anv_measure_destroy(struct anv_cmd_buffer *cmd_buffer); +void anv_measure_reset(struct anv_cmd_buffer *cmd_buffer); + +void _anv_measure_snapshot(struct anv_cmd_buffer *cmd_buffer, + enum intel_measure_snapshot_type type, + const char *event_name, + uint32_t count); + +/* ends snapshots before command buffer submission */ +void _anv_measure_endcommandbuffer(struct anv_cmd_buffer *cmd_buffer); + +/* when measuring render passes, inserts a timestamp */ +void _anv_measure_beginrenderpass(struct anv_cmd_buffer *cmd_buffer); + +/* tracks frame progression */ +void _anv_measure_acquire(struct anv_device *device); + +/* should be combined with endcommandbuffer */ +void _anv_measure_submit(struct anv_cmd_buffer *cmd_buffer); + +void +_anv_measure_add_secondary(struct anv_cmd_buffer *primary, + struct anv_cmd_buffer *secondary); + +#define anv_measure_acquire(device) \ + if (unlikely(device->physical->measure_device.config)) \ + _anv_measure_acquire(device) + +#define anv_measure_snapshot(cmd_buffer, type, event_name, count) \ + if (unlikely(cmd_buffer->measure)) \ + _anv_measure_snapshot(cmd_buffer, type, event_name, count) + +#define anv_measure_endcommandbuffer(cmd_buffer) \ + if (unlikely(cmd_buffer->measure)) \ + _anv_measure_endcommandbuffer(cmd_buffer) + +#define anv_measure_beginrenderpass(cmd_buffer) \ + if (unlikely(cmd_buffer->measure)) \ + _anv_measure_beginrenderpass(cmd_buffer) + +#define anv_measure_submit(cmd_buffer) \ + if (unlikely(cmd_buffer->measure)) \ + _anv_measure_submit(cmd_buffer) + +#define anv_measure_add_secondary(primary, secondary) \ + if (unlikely(primary->measure)) \ + _anv_measure_add_secondary(primary, secondary) + +#endif /* ANV_MEASURE_H */ diff --git a/src/intel/vulkan_hasvk/anv_nir.h b/src/intel/vulkan_hasvk/anv_nir.h new file mode 100644 index 00000000000..86705dfd4f6 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir.h @@ -0,0 +1,97 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef ANV_NIR_H +#define ANV_NIR_H + +#include "nir/nir.h" +#include "anv_private.h" + +#ifdef __cplusplus +extern "C" { +#endif + +bool anv_check_for_primitive_replication(struct anv_device *device, + VkShaderStageFlags stages, + nir_shader **shaders, + uint32_t view_mask); + +bool anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask, + bool use_primitive_replication); + +bool anv_nir_lower_ycbcr_textures(nir_shader *shader, + const struct anv_pipeline_layout *layout); + +static inline nir_address_format +anv_nir_ssbo_addr_format(const struct anv_physical_device *pdevice, + bool robust_buffer_access) +{ + if (pdevice->has_a64_buffer_access) { + if (robust_buffer_access) + return nir_address_format_64bit_bounded_global; + else + return nir_address_format_64bit_global_32bit_offset; + } else { + return nir_address_format_32bit_index_offset; + } +} + +static inline nir_address_format +anv_nir_ubo_addr_format(const struct anv_physical_device *pdevice, + bool robust_buffer_access) +{ + if (pdevice->has_a64_buffer_access) { + if (robust_buffer_access) + return nir_address_format_64bit_bounded_global; + else + return nir_address_format_64bit_global_32bit_offset; + } else { + return nir_address_format_32bit_index_offset; + } +} + +bool anv_nir_lower_ubo_loads(nir_shader *shader); + +void anv_nir_apply_pipeline_layout(nir_shader *shader, + const struct anv_physical_device *pdevice, + bool robust_buffer_access, + const struct anv_pipeline_layout *layout, + struct anv_pipeline_bind_map *map); + +void anv_nir_compute_push_layout(nir_shader *nir, + const struct anv_physical_device *pdevice, + bool robust_buffer_access, + struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map, + void *mem_ctx); + +void anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map); + +bool anv_nir_add_base_work_group_id(nir_shader *shader); + +#ifdef __cplusplus +} +#endif + +#endif /* ANV_NIR_H */ diff --git a/src/intel/vulkan_hasvk/anv_nir_add_base_work_group_id.c b/src/intel/vulkan_hasvk/anv_nir_add_base_work_group_id.c new file mode 100644 index 00000000000..1283cb73eaa --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir_add_base_work_group_id.c @@ -0,0 +1,63 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
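Both anv_nir_ssbo_addr_format() and anv_nir_ubo_addr_format() above currently make the same choice; condensed into a single stand-in function for clarity (the enum constants are placeholders, not the real nir_address_format values):

    #include <stdbool.h>

    enum sketch_addr_fmt {
       SKETCH_FMT_32BIT_INDEX_OFFSET,        /* classic binding-table access */
       SKETCH_FMT_64BIT_GLOBAL_32BIT_OFFSET, /* A64, no bounds checking */
       SKETCH_FMT_64BIT_BOUNDED_GLOBAL,      /* A64 with bounds checking */
    };

    static enum sketch_addr_fmt
    buffer_addr_format(bool has_a64_buffer_access, bool robust_buffer_access)
    {
       if (!has_a64_buffer_access)
          return SKETCH_FMT_32BIT_INDEX_OFFSET;

       return robust_buffer_access ? SKETCH_FMT_64BIT_BOUNDED_GLOBAL
                                   : SKETCH_FMT_64BIT_GLOBAL_32BIT_OFFSET;
    }

Keeping separate UBO and SSBO entry points presumably leaves room for the two to diverge without touching callers.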
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_nir.h" +#include "nir/nir_builder.h" +#include "compiler/brw_compiler.h" + +static bool +anv_nir_add_base_work_group_id_instr(nir_builder *b, + nir_instr *instr, + UNUSED void *cb_data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *load_id = nir_instr_as_intrinsic(instr); + if (load_id->intrinsic != nir_intrinsic_load_workgroup_id) + return false; + + b->cursor = nir_after_instr(&load_id->instr); + + nir_ssa_def *load_base = + nir_load_push_constant(b, 3, 32, nir_imm_int(b, 0), + .base = offsetof(struct anv_push_constants, cs.base_work_group_id), + .range = 3 * sizeof(uint32_t)); + + nir_ssa_def *id = nir_iadd(b, &load_id->dest.ssa, load_base); + + nir_ssa_def_rewrite_uses_after(&load_id->dest.ssa, id, id->parent_instr); + return true; +} + +bool +anv_nir_add_base_work_group_id(nir_shader *shader) +{ + assert(shader->info.stage == MESA_SHADER_COMPUTE); + + return nir_shader_instructions_pass(shader, + anv_nir_add_base_work_group_id_instr, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); +} diff --git a/src/intel/vulkan_hasvk/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan_hasvk/anv_nir_apply_pipeline_layout.c new file mode 100644 index 00000000000..0dec0744516 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir_apply_pipeline_layout.c @@ -0,0 +1,1686 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_nir.h" +#include "program/prog_parameter.h" +#include "nir/nir_builder.h" +#include "compiler/brw_nir.h" +#include "util/mesa-sha1.h" +#include "util/set.h" + +/* Sampler tables don't actually have a maximum size but we pick one just so + * that we don't end up emitting too much state on-the-fly. 
+ */ +#define MAX_SAMPLER_TABLE_SIZE 128 +#define BINDLESS_OFFSET 255 + +#define sizeof_field(type, field) sizeof(((type *)0)->field) + +struct apply_pipeline_layout_state { + const struct anv_physical_device *pdevice; + + const struct anv_pipeline_layout *layout; + bool add_bounds_checks; + nir_address_format desc_addr_format; + nir_address_format ssbo_addr_format; + nir_address_format ubo_addr_format; + + /* Place to flag lowered instructions so we don't lower them twice */ + struct set *lowered_instrs; + + bool uses_constants; + bool has_dynamic_buffers; + uint8_t constants_offset; + struct { + bool desc_buffer_used; + uint8_t desc_offset; + + uint8_t *use_count; + uint8_t *surface_offsets; + uint8_t *sampler_offsets; + } set[MAX_SETS]; +}; + +static nir_address_format +addr_format_for_desc_type(VkDescriptorType desc_type, + struct apply_pipeline_layout_state *state) +{ + switch (desc_type) { + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + return state->ssbo_addr_format; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + return state->ubo_addr_format; + + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: + return state->desc_addr_format; + + default: + unreachable("Unsupported descriptor type"); + } +} + +static void +add_binding(struct apply_pipeline_layout_state *state, + uint32_t set, uint32_t binding) +{ + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + if (state->set[set].use_count[binding] < UINT8_MAX) + state->set[set].use_count[binding]++; + + /* Only flag the descriptor buffer as used if there's actually data for + * this binding. This lets us be lazy and call this function constantly + * without worrying about unnecessarily enabling the buffer. 
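What add_binding() above tracks per binding, as a standalone sketch with stand-in parameters rather than the anv layout structs:

    #include <stdbool.h>
    #include <stdint.h>

    /* The per-binding use counter is only 8 bits wide, so it saturates at 255
     * instead of wrapping, and the set's descriptor buffer is only flagged as
     * used when the binding actually has descriptor data to read (a zero
     * stride means there is nothing in the buffer for it). */
    static void
    note_binding_use(uint8_t *use_count, bool *desc_buffer_used,
                     uint32_t descriptor_stride)
    {
       if (*use_count < UINT8_MAX)
          (*use_count)++;

       if (descriptor_stride != 0)
          *desc_buffer_used = true;
    }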
+ */ + if (bind_layout->descriptor_stride) + state->set[set].desc_buffer_used = true; +} + +static void +add_deref_src_binding(struct apply_pipeline_layout_state *state, nir_src src) +{ + nir_deref_instr *deref = nir_src_as_deref(src); + nir_variable *var = nir_deref_instr_get_variable(deref); + add_binding(state, var->data.descriptor_set, var->data.binding); +} + +static void +add_tex_src_binding(struct apply_pipeline_layout_state *state, + nir_tex_instr *tex, nir_tex_src_type deref_src_type) +{ + int deref_src_idx = nir_tex_instr_src_index(tex, deref_src_type); + if (deref_src_idx < 0) + return; + + add_deref_src_binding(state, tex->src[deref_src_idx].src); +} + +static bool +get_used_bindings(UNUSED nir_builder *_b, nir_instr *instr, void *_state) +{ + struct apply_pipeline_layout_state *state = _state; + + switch (instr->type) { + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_vulkan_resource_index: + add_binding(state, nir_intrinsic_desc_set(intrin), + nir_intrinsic_binding(intrin)); + break; + + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_fadd: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: + case nir_intrinsic_image_deref_load_param_intel: + case nir_intrinsic_image_deref_load_raw_intel: + case nir_intrinsic_image_deref_store_raw_intel: + add_deref_src_binding(state, intrin->src[0]); + break; + + case nir_intrinsic_load_constant: + state->uses_constants = true; + break; + + default: + break; + } + break; + } + case nir_instr_type_tex: { + nir_tex_instr *tex = nir_instr_as_tex(instr); + add_tex_src_binding(state, tex, nir_tex_src_texture_deref); + add_tex_src_binding(state, tex, nir_tex_src_sampler_deref); + break; + } + default: + break; + } + + return false; +} + +static nir_intrinsic_instr * +find_descriptor_for_index_src(nir_src src, + struct apply_pipeline_layout_state *state) +{ + nir_intrinsic_instr *intrin = nir_src_as_intrinsic(src); + + while (intrin && intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) + intrin = nir_src_as_intrinsic(intrin->src[0]); + + if (!intrin || intrin->intrinsic != nir_intrinsic_vulkan_resource_index) + return NULL; + + return intrin; +} + +static bool +descriptor_has_bti(nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_index); + + uint32_t set = nir_intrinsic_desc_set(intrin); + uint32_t binding = nir_intrinsic_binding(intrin); + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + uint32_t surface_index; + if (bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) + surface_index = state->set[set].desc_offset; + else + surface_index = state->set[set].surface_offsets[binding]; + + /* Only lower to a BTI message if we have a valid binding table index. 
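find_descriptor_for_index_src() above is a straightforward chain walk; the same shape with a stand-in node type instead of NIR intrinsics:

    #include <stddef.h>

    enum sketch_index_kind { SKETCH_RESOURCE_INDEX, SKETCH_RESOURCE_REINDEX };

    struct sketch_index_node {
       enum sketch_index_kind kind;
       struct sketch_index_node *parent;   /* src[0] of a reindex, else NULL */
    };

    /* Re-index nodes chain back to the vulkan_resource_index that started
     * them, so the root is found by walking parents until something that is
     * not a reindex appears. */
    static struct sketch_index_node *
    find_root_index(struct sketch_index_node *n)
    {
       while (n && n->kind == SKETCH_RESOURCE_REINDEX)
          n = n->parent;

       return (n && n->kind == SKETCH_RESOURCE_INDEX) ? n : NULL;
    }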
*/ + return surface_index < MAX_BINDING_TABLE_SIZE; +} + +static nir_address_format +descriptor_address_format(nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_index); + + return addr_format_for_desc_type(nir_intrinsic_desc_type(intrin), state); +} + +static nir_intrinsic_instr * +nir_deref_find_descriptor(nir_deref_instr *deref, + struct apply_pipeline_layout_state *state) +{ + while (1) { + /* Nothing we will use this on has a variable */ + assert(deref->deref_type != nir_deref_type_var); + + nir_deref_instr *parent = nir_src_as_deref(deref->parent); + if (!parent) + break; + + deref = parent; + } + assert(deref->deref_type == nir_deref_type_cast); + + nir_intrinsic_instr *intrin = nir_src_as_intrinsic(deref->parent); + if (!intrin || intrin->intrinsic != nir_intrinsic_load_vulkan_descriptor) + return false; + + return find_descriptor_for_index_src(intrin->src[0], state); +} + +static nir_ssa_def * +build_load_descriptor_mem(nir_builder *b, + nir_ssa_def *desc_addr, unsigned desc_offset, + unsigned num_components, unsigned bit_size, + struct apply_pipeline_layout_state *state) + +{ + switch (state->desc_addr_format) { + case nir_address_format_64bit_global_32bit_offset: { + nir_ssa_def *base_addr = + nir_pack_64_2x32(b, nir_channels(b, desc_addr, 0x3)); + nir_ssa_def *offset32 = + nir_iadd_imm(b, nir_channel(b, desc_addr, 3), desc_offset); + + return nir_load_global_constant_offset(b, num_components, bit_size, + base_addr, offset32, + .align_mul = 8, + .align_offset = desc_offset % 8); + } + + case nir_address_format_32bit_index_offset: { + nir_ssa_def *surface_index = nir_channel(b, desc_addr, 0); + nir_ssa_def *offset32 = + nir_iadd_imm(b, nir_channel(b, desc_addr, 1), desc_offset); + + return nir_load_ubo(b, num_components, bit_size, + surface_index, offset32, + .align_mul = 8, + .align_offset = desc_offset % 8, + .range_base = 0, + .range = ~0); + } + + default: + unreachable("Unsupported address format"); + } +} + +/** Build a Vulkan resource index + * + * A "resource index" is the term used by our SPIR-V parser and the relevant + * NIR intrinsics for a reference into a descriptor set. It acts much like a + * deref in NIR except that it accesses opaque descriptors instead of memory. + * + * Coming out of SPIR-V, both the resource indices (in the form of + * vulkan_resource_[re]index intrinsics) and the memory derefs (in the form + * of nir_deref_instr) use the same vector component/bit size. The meaning + * of those values for memory derefs (nir_deref_instr) is given by the + * nir_address_format associated with the descriptor type. For resource + * indices, it's an entirely internal to ANV encoding which describes, in some + * sense, the address of the descriptor. Thanks to the NIR/SPIR-V rules, it + * must be packed into the same size SSA values as a memory address. For this + * reason, the actual encoding may depend both on the address format for + * memory derefs and the descriptor address format. + * + * The load_vulkan_descriptor intrinsic exists to provide a transition point + * between these two forms of derefs: descriptor and memory. 
+ */ +static nir_ssa_def * +build_res_index(nir_builder *b, uint32_t set, uint32_t binding, + nir_ssa_def *array_index, nir_address_format addr_format, + struct apply_pipeline_layout_state *state) +{ + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + uint32_t array_size = bind_layout->array_size; + + switch (addr_format) { + case nir_address_format_64bit_global_32bit_offset: + case nir_address_format_64bit_bounded_global: { + uint32_t set_idx; + switch (state->desc_addr_format) { + case nir_address_format_64bit_global_32bit_offset: + set_idx = set; + break; + + case nir_address_format_32bit_index_offset: + assert(state->set[set].desc_offset < MAX_BINDING_TABLE_SIZE); + set_idx = state->set[set].desc_offset; + break; + + default: + unreachable("Unsupported address format"); + } + + assert(bind_layout->dynamic_offset_index < MAX_DYNAMIC_BUFFERS); + uint32_t dynamic_offset_index = 0xff; /* No dynamic offset */ + if (bind_layout->dynamic_offset_index >= 0) { + dynamic_offset_index = + state->layout->set[set].dynamic_offset_start + + bind_layout->dynamic_offset_index; + } + + const uint32_t packed = (bind_layout->descriptor_stride << 16 ) | (set_idx << 8) | dynamic_offset_index; + + return nir_vec4(b, nir_imm_int(b, packed), + nir_imm_int(b, bind_layout->descriptor_offset), + nir_imm_int(b, array_size - 1), + array_index); + } + + case nir_address_format_32bit_index_offset: { + assert(state->desc_addr_format == nir_address_format_32bit_index_offset); + if (bind_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + uint32_t surface_index = state->set[set].desc_offset; + return nir_imm_ivec2(b, surface_index, + bind_layout->descriptor_offset); + } else { + uint32_t surface_index = state->set[set].surface_offsets[binding]; + assert(array_size > 0 && array_size <= UINT16_MAX); + assert(surface_index <= UINT16_MAX); + uint32_t packed = ((array_size - 1) << 16) | surface_index; + return nir_vec2(b, array_index, nir_imm_int(b, packed)); + } + } + + default: + unreachable("Unsupported address format"); + } +} + +struct res_index_defs { + nir_ssa_def *set_idx; + nir_ssa_def *dyn_offset_base; + nir_ssa_def *desc_offset_base; + nir_ssa_def *array_index; + nir_ssa_def *desc_stride; +}; + +static struct res_index_defs +unpack_res_index(nir_builder *b, nir_ssa_def *index) +{ + struct res_index_defs defs; + + nir_ssa_def *packed = nir_channel(b, index, 0); + defs.desc_stride = nir_extract_u8(b, packed, nir_imm_int(b, 2)); + defs.set_idx = nir_extract_u8(b, packed, nir_imm_int(b, 1)); + defs.dyn_offset_base = nir_extract_u8(b, packed, nir_imm_int(b, 0)); + + defs.desc_offset_base = nir_channel(b, index, 1); + defs.array_index = nir_umin(b, nir_channel(b, index, 2), + nir_channel(b, index, 3)); + + return defs; +} + +/** Adjust a Vulkan resource index + * + * This is the equivalent of nir_deref_type_ptr_as_array for resource indices. + * For array descriptors, it allows us to adjust the array index. Thanks to + * variable pointers, we cannot always fold this re-index operation into the + * vulkan_resource_index intrinsic and we have to do it based on nothing but + * the address format. 
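+ *
+ * A minimal sketch of the effect (illustrative only): for the 64-bit
+ * address formats,
+ *
+ *    nir_ssa_def *reindexed =
+ *       build_res_reindex(b, orig, nir_imm_int(b, 2), addr_format);
+ *
+ * yields the same vector as orig with 2 added to its array-index channel
+ * (.w); for nir_address_format_32bit_index_offset the delta is added to
+ * the first channel instead.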
+ */ +static nir_ssa_def * +build_res_reindex(nir_builder *b, nir_ssa_def *orig, nir_ssa_def *delta, + nir_address_format addr_format) +{ + switch (addr_format) { + case nir_address_format_64bit_global_32bit_offset: + case nir_address_format_64bit_bounded_global: + return nir_vec4(b, nir_channel(b, orig, 0), + nir_channel(b, orig, 1), + nir_channel(b, orig, 2), + nir_iadd(b, nir_channel(b, orig, 3), delta)); + + case nir_address_format_32bit_index_offset: + return nir_vec2(b, nir_iadd(b, nir_channel(b, orig, 0), delta), + nir_channel(b, orig, 1)); + + default: + unreachable("Unhandled address format"); + } +} + +/** Get the address for a descriptor given its resource index + * + * Because of the re-indexing operations, we can't bounds check descriptor + * array access until we have the final index. That means we end up doing the + * bounds check here, if needed. See unpack_res_index() for more details. + * + * This function takes both a bind_layout and a desc_type which are used to + * determine the descriptor stride for array descriptors. The bind_layout is + * optional for buffer descriptor types. + */ +static nir_ssa_def * +build_desc_addr(nir_builder *b, + const struct anv_descriptor_set_binding_layout *bind_layout, + const VkDescriptorType desc_type, + nir_ssa_def *index, nir_address_format addr_format, + struct apply_pipeline_layout_state *state) +{ + switch (addr_format) { + case nir_address_format_64bit_global_32bit_offset: + case nir_address_format_64bit_bounded_global: { + struct res_index_defs res = unpack_res_index(b, index); + + nir_ssa_def *desc_offset = res.desc_offset_base; + if (desc_type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + /* Compute the actual descriptor offset. For inline uniform blocks, + * the array index is ignored as they are only allowed to be a single + * descriptor (not an array) and there is no concept of a "stride". + * + */ + desc_offset = + nir_iadd(b, desc_offset, nir_imul(b, res.array_index, res.desc_stride)); + } + + switch (state->desc_addr_format) { + case nir_address_format_64bit_global_32bit_offset: { + nir_ssa_def *base_addr = + nir_load_desc_set_address_intel(b, res.set_idx); + return nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_addr), + nir_unpack_64_2x32_split_y(b, base_addr), + nir_imm_int(b, UINT32_MAX), + desc_offset); + } + + case nir_address_format_32bit_index_offset: + return nir_vec2(b, res.set_idx, desc_offset); + + default: + unreachable("Unhandled address format"); + } + } + + case nir_address_format_32bit_index_offset: + assert(desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK); + assert(state->desc_addr_format == nir_address_format_32bit_index_offset); + return index; + + default: + unreachable("Unhandled address format"); + } +} + +/** Convert a Vulkan resource index into a buffer address + * + * In some cases, this does a memory load from the descriptor set and, in + * others, it simply converts from one form to another. + * + * See build_res_index for details about each resource index format. 
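+ *
+ * For example (illustrative, not exhaustive), lowering an SSBO access in
+ * one of the A64 address formats,
+ *
+ *    nir_ssa_def *addr =
+ *       build_buffer_addr_for_res_index(b, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ *                                       res_index, addr_format, state);
+ *
+ * loads the anv_address_range_descriptor from the descriptor buffer (and
+ * folds in any dynamic offset), whereas with
+ * nir_address_format_32bit_index_offset it reduces to a
+ * (surface_index + array_index, 0) pair with no memory access at all.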
+ */ +static nir_ssa_def * +build_buffer_addr_for_res_index(nir_builder *b, + const VkDescriptorType desc_type, + nir_ssa_def *res_index, + nir_address_format addr_format, + struct apply_pipeline_layout_state *state) +{ + if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { + assert(addr_format == state->desc_addr_format); + return build_desc_addr(b, NULL, desc_type, res_index, addr_format, state); + } else if (addr_format == nir_address_format_32bit_index_offset) { + nir_ssa_def *array_index = nir_channel(b, res_index, 0); + nir_ssa_def *packed = nir_channel(b, res_index, 1); + nir_ssa_def *array_max = nir_extract_u16(b, packed, nir_imm_int(b, 1)); + nir_ssa_def *surface_index = nir_extract_u16(b, packed, nir_imm_int(b, 0)); + + if (state->add_bounds_checks) + array_index = nir_umin(b, array_index, array_max); + + return nir_vec2(b, nir_iadd(b, surface_index, array_index), + nir_imm_int(b, 0)); + } + + nir_ssa_def *desc_addr = + build_desc_addr(b, NULL, desc_type, res_index, addr_format, state); + + nir_ssa_def *desc = build_load_descriptor_mem(b, desc_addr, 0, 4, 32, state); + + if (state->has_dynamic_buffers) { + struct res_index_defs res = unpack_res_index(b, res_index); + + /* This shader has dynamic offsets and we have no way of knowing + * (save from the dynamic offset base index) if this buffer has a + * dynamic offset. + */ + nir_ssa_def *dyn_offset_idx = + nir_iadd(b, res.dyn_offset_base, res.array_index); + if (state->add_bounds_checks) { + dyn_offset_idx = nir_umin(b, dyn_offset_idx, + nir_imm_int(b, MAX_DYNAMIC_BUFFERS)); + } + + nir_ssa_def *dyn_load = + nir_load_push_constant(b, 1, 32, nir_imul_imm(b, dyn_offset_idx, 4), + .base = offsetof(struct anv_push_constants, dynamic_offsets), + .range = MAX_DYNAMIC_BUFFERS * 4); + + nir_ssa_def *dynamic_offset = + nir_bcsel(b, nir_ieq_imm(b, res.dyn_offset_base, 0xff), + nir_imm_int(b, 0), dyn_load); + + /* The dynamic offset gets added to the base pointer so that we + * have a sliding window range. + */ + nir_ssa_def *base_ptr = + nir_pack_64_2x32(b, nir_channels(b, desc, 0x3)); + base_ptr = nir_iadd(b, base_ptr, nir_u2u64(b, dynamic_offset)); + desc = nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_ptr), + nir_unpack_64_2x32_split_y(b, base_ptr), + nir_channel(b, desc, 2), + nir_channel(b, desc, 3)); + } + + /* The last element of the vec4 is always zero. + * + * See also struct anv_address_range_descriptor + */ + return nir_vec4(b, nir_channel(b, desc, 0), + nir_channel(b, desc, 1), + nir_channel(b, desc, 2), + nir_imm_int(b, 0)); +} + +/** Loads descriptor memory for a variable-based deref chain + * + * The deref chain has to terminate at a variable with a descriptor_set and + * binding set. This is used for images, textures, and samplers. 
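+ *
+ * Typical use (see lower_tex_deref() and lower_image_intrinsic() below) is
+ * pulling a couple of dwords out of the descriptor backing the variable,
+ * e.g. the texture/sampler handle pair of one plane of a sampled image:
+ *
+ *    nir_ssa_def *desc =
+ *       build_load_var_deref_descriptor_mem(b, deref, plane_offset,
+ *                                           2, 32, state);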
+ */ +static nir_ssa_def * +build_load_var_deref_descriptor_mem(nir_builder *b, nir_deref_instr *deref, + unsigned desc_offset, + unsigned num_components, unsigned bit_size, + struct apply_pipeline_layout_state *state) +{ + nir_variable *var = nir_deref_instr_get_variable(deref); + + const uint32_t set = var->data.descriptor_set; + const uint32_t binding = var->data.binding; + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + nir_ssa_def *array_index; + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + assert(nir_deref_instr_parent(deref)->deref_type == nir_deref_type_var); + assert(deref->arr.index.is_ssa); + array_index = deref->arr.index.ssa; + } else { + array_index = nir_imm_int(b, 0); + } + + /* It doesn't really matter what address format we choose as everything + * will constant-fold nicely. Choose one that uses the actual descriptor + * buffer so we don't run into issues index/offset assumptions. + */ + const nir_address_format addr_format = + nir_address_format_64bit_bounded_global; + + nir_ssa_def *res_index = + build_res_index(b, set, binding, array_index, addr_format, state); + + nir_ssa_def *desc_addr = + build_desc_addr(b, bind_layout, bind_layout->type, + res_index, addr_format, state); + + return build_load_descriptor_mem(b, desc_addr, desc_offset, + num_components, bit_size, state); +} + +/** A recursive form of build_res_index() + * + * This recursively walks a resource [re]index chain and builds the resource + * index. It places the new code with the resource [re]index operation in the + * hopes of better CSE. This means the cursor is not where you left it when + * this function returns. + */ +static nir_ssa_def * +build_res_index_for_chain(nir_builder *b, nir_intrinsic_instr *intrin, + nir_address_format addr_format, + uint32_t *set, uint32_t *binding, + struct apply_pipeline_layout_state *state) +{ + if (intrin->intrinsic == nir_intrinsic_vulkan_resource_index) { + b->cursor = nir_before_instr(&intrin->instr); + assert(intrin->src[0].is_ssa); + *set = nir_intrinsic_desc_set(intrin); + *binding = nir_intrinsic_binding(intrin); + return build_res_index(b, *set, *binding, intrin->src[0].ssa, + addr_format, state); + } else { + assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex); + nir_intrinsic_instr *parent = nir_src_as_intrinsic(intrin->src[0]); + nir_ssa_def *index = + build_res_index_for_chain(b, parent, addr_format, + set, binding, state); + + b->cursor = nir_before_instr(&intrin->instr); + + assert(intrin->src[1].is_ssa); + return build_res_reindex(b, index, intrin->src[1].ssa, addr_format); + } +} + +/** Builds a buffer address for a given vulkan [re]index intrinsic + * + * The cursor is not where you left it when this function returns. 
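+ *
+ * Illustrative call (this mirrors the get_ssbo_size lowering further down):
+ *
+ *    nir_ssa_def *addr =
+ *       build_buffer_addr_for_idx_intrin(b, idx_intrin,
+ *                                        nir_address_format_32bit_index_offset,
+ *                                        state);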
+ */ +static nir_ssa_def * +build_buffer_addr_for_idx_intrin(nir_builder *b, + nir_intrinsic_instr *idx_intrin, + nir_address_format addr_format, + struct apply_pipeline_layout_state *state) +{ + uint32_t set = UINT32_MAX, binding = UINT32_MAX; + nir_ssa_def *res_index = + build_res_index_for_chain(b, idx_intrin, addr_format, + &set, &binding, state); + + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + return build_buffer_addr_for_res_index(b, bind_layout->type, + res_index, addr_format, state); +} + +/** Builds a buffer address for deref chain + * + * This assumes that you can chase the chain all the way back to the original + * vulkan_resource_index intrinsic. + * + * The cursor is not where you left it when this function returns. + */ +static nir_ssa_def * +build_buffer_addr_for_deref(nir_builder *b, nir_deref_instr *deref, + nir_address_format addr_format, + struct apply_pipeline_layout_state *state) +{ + nir_deref_instr *parent = nir_deref_instr_parent(deref); + if (parent) { + nir_ssa_def *addr = + build_buffer_addr_for_deref(b, parent, addr_format, state); + + b->cursor = nir_before_instr(&deref->instr); + return nir_explicit_io_address_from_deref(b, deref, addr, addr_format); + } + + nir_intrinsic_instr *load_desc = nir_src_as_intrinsic(deref->parent); + assert(load_desc->intrinsic == nir_intrinsic_load_vulkan_descriptor); + + nir_intrinsic_instr *idx_intrin = nir_src_as_intrinsic(load_desc->src[0]); + + b->cursor = nir_before_instr(&deref->instr); + + return build_buffer_addr_for_idx_intrin(b, idx_intrin, addr_format, state); +} + +static bool +try_lower_direct_buffer_intrinsic(nir_builder *b, + nir_intrinsic_instr *intrin, bool is_atomic, + struct apply_pipeline_layout_state *state) +{ + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + if (!nir_deref_mode_is_one_of(deref, nir_var_mem_ubo | nir_var_mem_ssbo)) + return false; + + nir_intrinsic_instr *desc = nir_deref_find_descriptor(deref, state); + if (desc == NULL) { + /* We should always be able to find the descriptor for UBO access. */ + assert(nir_deref_mode_is_one_of(deref, nir_var_mem_ssbo)); + return false; + } + + nir_address_format addr_format = descriptor_address_format(desc, state); + + if (nir_deref_mode_is(deref, nir_var_mem_ssbo)) { + /* 64-bit atomics only support A64 messages so we can't lower them to + * the index+offset model. + */ + if (is_atomic && nir_dest_bit_size(intrin->dest) == 64 && + !state->pdevice->info.has_lsc) + return false; + + /* Normal binding table-based messages can't handle non-uniform access + * so we have to fall back to A64. 
+ */ + if (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM) + return false; + + if (!descriptor_has_bti(desc, state)) + return false; + + /* Rewrite to 32bit_index_offset whenever we can */ + addr_format = nir_address_format_32bit_index_offset; + } else { + assert(nir_deref_mode_is(deref, nir_var_mem_ubo)); + + /* Rewrite to 32bit_index_offset whenever we can */ + if (descriptor_has_bti(desc, state)) + addr_format = nir_address_format_32bit_index_offset; + } + + nir_ssa_def *addr = + build_buffer_addr_for_deref(b, deref, addr_format, state); + + b->cursor = nir_before_instr(&intrin->instr); + nir_lower_explicit_io_instr(b, intrin, addr, addr_format); + + return true; +} + +static bool +lower_load_accel_struct_desc(nir_builder *b, + nir_intrinsic_instr *load_desc, + struct apply_pipeline_layout_state *state) +{ + assert(load_desc->intrinsic == nir_intrinsic_load_vulkan_descriptor); + + nir_intrinsic_instr *idx_intrin = nir_src_as_intrinsic(load_desc->src[0]); + + /* It doesn't really matter what address format we choose as + * everything will constant-fold nicely. Choose one that uses the + * actual descriptor buffer. + */ + const nir_address_format addr_format = + nir_address_format_64bit_bounded_global; + + uint32_t set = UINT32_MAX, binding = UINT32_MAX; + nir_ssa_def *res_index = + build_res_index_for_chain(b, idx_intrin, addr_format, + &set, &binding, state); + + const struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + b->cursor = nir_before_instr(&load_desc->instr); + + nir_ssa_def *desc_addr = + build_desc_addr(b, bind_layout, bind_layout->type, + res_index, addr_format, state); + + /* Acceleration structure descriptors are always uint64_t */ + nir_ssa_def *desc = build_load_descriptor_mem(b, desc_addr, 0, 1, 64, state); + + assert(load_desc->dest.is_ssa); + assert(load_desc->dest.ssa.bit_size == 64); + assert(load_desc->dest.ssa.num_components == 1); + nir_ssa_def_rewrite_uses(&load_desc->dest.ssa, desc); + nir_instr_remove(&load_desc->instr); + + return true; +} + +static bool +lower_direct_buffer_instr(nir_builder *b, nir_instr *instr, void *_state) +{ + struct apply_pipeline_layout_state *state = _state; + + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_deref: + case nir_intrinsic_store_deref: + return try_lower_direct_buffer_intrinsic(b, intrin, false, state); + + case nir_intrinsic_deref_atomic_add: + case nir_intrinsic_deref_atomic_imin: + case nir_intrinsic_deref_atomic_umin: + case nir_intrinsic_deref_atomic_imax: + case nir_intrinsic_deref_atomic_umax: + case nir_intrinsic_deref_atomic_and: + case nir_intrinsic_deref_atomic_or: + case nir_intrinsic_deref_atomic_xor: + case nir_intrinsic_deref_atomic_exchange: + case nir_intrinsic_deref_atomic_comp_swap: + case nir_intrinsic_deref_atomic_fadd: + case nir_intrinsic_deref_atomic_fmin: + case nir_intrinsic_deref_atomic_fmax: + case nir_intrinsic_deref_atomic_fcomp_swap: + return try_lower_direct_buffer_intrinsic(b, intrin, true, state); + + case nir_intrinsic_get_ssbo_size: { + /* The get_ssbo_size intrinsic always just takes a + * index/reindex intrinsic. 
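+ * If that descriptor has a binding table entry, the buffer source is
+ * rewritten to the scalar BTI and the instruction is recorded in
+ * state->lowered_instrs so that lower_get_ssbo_size() (the generic path
+ * run later) knows to leave it alone.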
+ */ + nir_intrinsic_instr *idx_intrin = + find_descriptor_for_index_src(intrin->src[0], state); + if (idx_intrin == NULL || !descriptor_has_bti(idx_intrin, state)) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + + /* We just checked that this is a BTI descriptor */ + const nir_address_format addr_format = + nir_address_format_32bit_index_offset; + + nir_ssa_def *buffer_addr = + build_buffer_addr_for_idx_intrin(b, idx_intrin, addr_format, state); + + b->cursor = nir_before_instr(&intrin->instr); + nir_ssa_def *bti = nir_channel(b, buffer_addr, 0); + + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], + nir_src_for_ssa(bti)); + _mesa_set_add(state->lowered_instrs, intrin); + return true; + } + + case nir_intrinsic_load_vulkan_descriptor: + if (nir_intrinsic_desc_type(intrin) == + VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR) + return lower_load_accel_struct_desc(b, intrin, state); + return false; + + default: + return false; + } +} + +static bool +lower_res_index_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + b->cursor = nir_before_instr(&intrin->instr); + + nir_address_format addr_format = + addr_format_for_desc_type(nir_intrinsic_desc_type(intrin), state); + + assert(intrin->src[0].is_ssa); + nir_ssa_def *index = + build_res_index(b, nir_intrinsic_desc_set(intrin), + nir_intrinsic_binding(intrin), + intrin->src[0].ssa, + addr_format, state); + + assert(intrin->dest.is_ssa); + assert(intrin->dest.ssa.bit_size == index->bit_size); + assert(intrin->dest.ssa.num_components == index->num_components); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, index); + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_res_reindex_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + b->cursor = nir_before_instr(&intrin->instr); + + nir_address_format addr_format = + addr_format_for_desc_type(nir_intrinsic_desc_type(intrin), state); + + assert(intrin->src[0].is_ssa && intrin->src[1].is_ssa); + nir_ssa_def *index = + build_res_reindex(b, intrin->src[0].ssa, + intrin->src[1].ssa, + addr_format); + + assert(intrin->dest.is_ssa); + assert(intrin->dest.ssa.bit_size == index->bit_size); + assert(intrin->dest.ssa.num_components == index->num_components); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, index); + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + b->cursor = nir_before_instr(&intrin->instr); + + const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin); + nir_address_format addr_format = addr_format_for_desc_type(desc_type, state); + + assert(intrin->dest.is_ssa); + nir_foreach_use(src, &intrin->dest.ssa) { + if (src->parent_instr->type != nir_instr_type_deref) + continue; + + nir_deref_instr *cast = nir_instr_as_deref(src->parent_instr); + assert(cast->deref_type == nir_deref_type_cast); + switch (desc_type) { + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + cast->cast.align_mul = ANV_UBO_ALIGNMENT; + cast->cast.align_offset = 0; + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + cast->cast.align_mul = ANV_SSBO_ALIGNMENT; + cast->cast.align_offset = 0; + break; + + default: + break; + } + } + + assert(intrin->src[0].is_ssa); + nir_ssa_def *desc = + build_buffer_addr_for_res_index(b, desc_type, 
intrin->src[0].ssa, + addr_format, state); + + assert(intrin->dest.is_ssa); + assert(intrin->dest.ssa.bit_size == desc->bit_size); + assert(intrin->dest.ssa.num_components == desc->num_components); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc); + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +lower_get_ssbo_size(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + if (_mesa_set_search(state->lowered_instrs, intrin)) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + + nir_address_format addr_format = + addr_format_for_desc_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, state); + + assert(intrin->src[0].is_ssa); + nir_ssa_def *desc = + build_buffer_addr_for_res_index(b, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + intrin->src[0].ssa, addr_format, state); + + switch (addr_format) { + case nir_address_format_64bit_global_32bit_offset: + case nir_address_format_64bit_bounded_global: { + nir_ssa_def *size = nir_channel(b, desc, 2); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, size); + nir_instr_remove(&intrin->instr); + break; + } + + case nir_address_format_32bit_index_offset: + /* The binding table index is the first component of the address. The + * back-end wants a scalar binding table index source. + */ + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], + nir_src_for_ssa(nir_channel(b, desc, 0))); + break; + + default: + unreachable("Unsupported address format"); + } + + return true; +} + +static bool +image_binding_needs_lowered_surface(nir_variable *var) +{ + return !(var->data.access & ACCESS_NON_READABLE) && + var->data.image.format != PIPE_FORMAT_NONE; +} + +static bool +lower_image_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + nir_variable *var = nir_deref_instr_get_variable(deref); + + unsigned set = var->data.descriptor_set; + unsigned binding = var->data.binding; + unsigned binding_offset = state->set[set].surface_offsets[binding]; + + b->cursor = nir_before_instr(&intrin->instr); + + ASSERTED const bool use_bindless = state->pdevice->has_bindless_images; + + if (intrin->intrinsic == nir_intrinsic_image_deref_load_param_intel) { + b->cursor = nir_instr_remove(&intrin->instr); + + assert(!use_bindless); /* Otherwise our offsets would be wrong */ + const unsigned param = nir_intrinsic_base(intrin); + + nir_ssa_def *desc = + build_load_var_deref_descriptor_mem(b, deref, param * 16, + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size, state); + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc); + } else if (binding_offset > MAX_BINDING_TABLE_SIZE) { + const unsigned desc_comp = + image_binding_needs_lowered_surface(var) ? 
1 : 0; + nir_ssa_def *desc = + build_load_var_deref_descriptor_mem(b, deref, 0, 2, 32, state); + nir_ssa_def *handle = nir_channel(b, desc, desc_comp); + nir_rewrite_image_intrinsic(intrin, handle, true); + } else { + unsigned array_size = + state->layout->set[set].layout->binding[binding].array_size; + + nir_ssa_def *index = NULL; + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + index = nir_ssa_for_src(b, deref->arr.index, 1); + if (state->add_bounds_checks) + index = nir_umin(b, index, nir_imm_int(b, array_size - 1)); + } else { + index = nir_imm_int(b, 0); + } + + index = nir_iadd_imm(b, index, binding_offset); + nir_rewrite_image_intrinsic(intrin, index, false); + } + + return true; +} + +static bool +lower_load_constant(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + b->cursor = nir_instr_remove(&intrin->instr); + + /* Any constant-offset load_constant instructions should have been removed + * by constant folding. + */ + assert(!nir_src_is_const(intrin->src[0])); + nir_ssa_def *offset = nir_iadd_imm(b, nir_ssa_for_src(b, intrin->src[0], 1), + nir_intrinsic_base(intrin)); + + nir_ssa_def *data; + if (!anv_use_relocations(state->pdevice)) { + unsigned load_size = intrin->dest.ssa.num_components * + intrin->dest.ssa.bit_size / 8; + unsigned load_align = intrin->dest.ssa.bit_size / 8; + + assert(load_size < b->shader->constant_data_size); + unsigned max_offset = b->shader->constant_data_size - load_size; + offset = nir_umin(b, offset, nir_imm_int(b, max_offset)); + + nir_ssa_def *const_data_base_addr = nir_pack_64_2x32_split(b, + nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW), + nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH)); + + data = nir_load_global_constant(b, nir_iadd(b, const_data_base_addr, + nir_u2u64(b, offset)), + load_align, + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size); + } else { + nir_ssa_def *index = nir_imm_int(b, state->constants_offset); + + data = nir_load_ubo(b, intrin->num_components, intrin->dest.ssa.bit_size, + index, offset, + .align_mul = intrin->dest.ssa.bit_size / 8, + .align_offset = 0, + .range_base = nir_intrinsic_base(intrin), + .range = nir_intrinsic_range(intrin)); + } + + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, data); + + return true; +} + +static void +lower_tex_deref(nir_builder *b, nir_tex_instr *tex, + nir_tex_src_type deref_src_type, + unsigned *base_index, unsigned plane, + struct apply_pipeline_layout_state *state) +{ + int deref_src_idx = nir_tex_instr_src_index(tex, deref_src_type); + if (deref_src_idx < 0) + return; + + nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src); + nir_variable *var = nir_deref_instr_get_variable(deref); + + unsigned set = var->data.descriptor_set; + unsigned binding = var->data.binding; + unsigned array_size = + state->layout->set[set].layout->binding[binding].array_size; + + unsigned binding_offset; + if (deref_src_type == nir_tex_src_texture_deref) { + binding_offset = state->set[set].surface_offsets[binding]; + } else { + assert(deref_src_type == nir_tex_src_sampler_deref); + binding_offset = state->set[set].sampler_offsets[binding]; + } + + nir_tex_src_type offset_src_type; + nir_ssa_def *index = NULL; + if (binding_offset > MAX_BINDING_TABLE_SIZE) { + const unsigned plane_offset = + plane * sizeof(struct anv_sampled_image_descriptor); + + nir_ssa_def *desc = + build_load_var_deref_descriptor_mem(b, deref, plane_offset, + 2, 32, 
state); + + if (deref_src_type == nir_tex_src_texture_deref) { + offset_src_type = nir_tex_src_texture_handle; + index = nir_channel(b, desc, 0); + } else { + assert(deref_src_type == nir_tex_src_sampler_deref); + offset_src_type = nir_tex_src_sampler_handle; + index = nir_channel(b, desc, 1); + } + } else { + if (deref_src_type == nir_tex_src_texture_deref) { + offset_src_type = nir_tex_src_texture_offset; + } else { + assert(deref_src_type == nir_tex_src_sampler_deref); + offset_src_type = nir_tex_src_sampler_offset; + } + + *base_index = binding_offset + plane; + + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + + if (nir_src_is_const(deref->arr.index)) { + unsigned arr_index = MIN2(nir_src_as_uint(deref->arr.index), array_size - 1); + struct anv_sampler **immutable_samplers = + state->layout->set[set].layout->binding[binding].immutable_samplers; + if (immutable_samplers) { + /* Array of YCbCr samplers are tightly packed in the binding + * tables, compute the offset of an element in the array by + * adding the number of planes of all preceding elements. + */ + unsigned desc_arr_index = 0; + for (int i = 0; i < arr_index; i++) + desc_arr_index += immutable_samplers[i]->n_planes; + *base_index += desc_arr_index; + } else { + *base_index += arr_index; + } + } else { + /* From VK_KHR_sampler_ycbcr_conversion: + * + * If sampler Y’CBCR conversion is enabled, the combined image + * sampler must be indexed only by constant integral expressions + * when aggregated into arrays in shader code, irrespective of + * the shaderSampledImageArrayDynamicIndexing feature. + */ + assert(nir_tex_instr_src_index(tex, nir_tex_src_plane) == -1); + + index = nir_ssa_for_src(b, deref->arr.index, 1); + + if (state->add_bounds_checks) + index = nir_umin(b, index, nir_imm_int(b, array_size - 1)); + } + } + } + + if (index) { + nir_instr_rewrite_src(&tex->instr, &tex->src[deref_src_idx].src, + nir_src_for_ssa(index)); + tex->src[deref_src_idx].src_type = offset_src_type; + } else { + nir_tex_instr_remove_src(tex, deref_src_idx); + } +} + +static uint32_t +tex_instr_get_and_remove_plane_src(nir_tex_instr *tex) +{ + int plane_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_plane); + if (plane_src_idx < 0) + return 0; + + unsigned plane = nir_src_as_uint(tex->src[plane_src_idx].src); + + nir_tex_instr_remove_src(tex, plane_src_idx); + + return plane; +} + +static nir_ssa_def * +build_def_array_select(nir_builder *b, nir_ssa_def **srcs, nir_ssa_def *idx, + unsigned start, unsigned end) +{ + if (start == end - 1) { + return srcs[start]; + } else { + unsigned mid = start + (end - start) / 2; + return nir_bcsel(b, nir_ilt(b, idx, nir_imm_int(b, mid)), + build_def_array_select(b, srcs, idx, start, mid), + build_def_array_select(b, srcs, idx, mid, end)); + } +} + +static void +lower_gfx7_tex_swizzle(nir_builder *b, nir_tex_instr *tex, unsigned plane, + struct apply_pipeline_layout_state *state) +{ + assert(state->pdevice->info.verx10 == 70); + if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF || + nir_tex_instr_is_query(tex) || + tex->op == nir_texop_tg4 || /* We can't swizzle TG4 */ + (tex->is_shadow && tex->is_new_style_shadow)) + return; + + int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref); + assert(deref_src_idx >= 0); + + nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src); + nir_variable *var = nir_deref_instr_get_variable(deref); + + unsigned set = var->data.descriptor_set; + unsigned binding = var->data.binding; + const 
struct anv_descriptor_set_binding_layout *bind_layout = + &state->layout->set[set].layout->binding[binding]; + + if ((bind_layout->data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE) == 0) + return; + + b->cursor = nir_before_instr(&tex->instr); + + const unsigned plane_offset = + plane * sizeof(struct anv_texture_swizzle_descriptor); + nir_ssa_def *swiz = + build_load_var_deref_descriptor_mem(b, deref, plane_offset, + 1, 32, state); + + b->cursor = nir_after_instr(&tex->instr); + + assert(tex->dest.ssa.bit_size == 32); + assert(tex->dest.ssa.num_components == 4); + + /* Initializing to undef is ok; nir_opt_undef will clean it up. */ + nir_ssa_def *undef = nir_ssa_undef(b, 1, 32); + nir_ssa_def *comps[8]; + for (unsigned i = 0; i < ARRAY_SIZE(comps); i++) + comps[i] = undef; + + comps[ISL_CHANNEL_SELECT_ZERO] = nir_imm_int(b, 0); + if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float) + comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_float(b, 1); + else + comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_int(b, 1); + comps[ISL_CHANNEL_SELECT_RED] = nir_channel(b, &tex->dest.ssa, 0); + comps[ISL_CHANNEL_SELECT_GREEN] = nir_channel(b, &tex->dest.ssa, 1); + comps[ISL_CHANNEL_SELECT_BLUE] = nir_channel(b, &tex->dest.ssa, 2); + comps[ISL_CHANNEL_SELECT_ALPHA] = nir_channel(b, &tex->dest.ssa, 3); + + nir_ssa_def *swiz_comps[4]; + for (unsigned i = 0; i < 4; i++) { + nir_ssa_def *comp_swiz = nir_extract_u8(b, swiz, nir_imm_int(b, i)); + swiz_comps[i] = build_def_array_select(b, comps, comp_swiz, 0, 8); + } + nir_ssa_def *swiz_tex_res = nir_vec(b, swiz_comps, 4); + + /* Rewrite uses before we insert so we don't rewrite this use */ + nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, + swiz_tex_res, + swiz_tex_res->parent_instr); +} + +static bool +lower_tex(nir_builder *b, nir_tex_instr *tex, + struct apply_pipeline_layout_state *state) +{ + unsigned plane = tex_instr_get_and_remove_plane_src(tex); + + /* On Ivy Bridge and Bay Trail, we have to swizzle in the shader. Do this + * before we lower the derefs away so we can still find the descriptor. 
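+ * (Each byte of the swizzle dword that lower_gfx7_tex_swizzle() loads above
+ * is an ISL_CHANNEL_SELECT_* value; build_def_array_select() turns it into
+ * a bcsel tree that picks, per destination component, one of
+ * {zero, one, r, g, b, a} from the raw texture result.)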
+ */ + if (state->pdevice->info.verx10 == 70) + lower_gfx7_tex_swizzle(b, tex, plane, state); + + b->cursor = nir_before_instr(&tex->instr); + + lower_tex_deref(b, tex, nir_tex_src_texture_deref, + &tex->texture_index, plane, state); + + lower_tex_deref(b, tex, nir_tex_src_sampler_deref, + &tex->sampler_index, plane, state); + + return true; +} + +static bool +lower_ray_query_globals(nir_builder *b, nir_intrinsic_instr *intrin, + struct apply_pipeline_layout_state *state) +{ + b->cursor = nir_instr_remove(&intrin->instr); + + nir_ssa_def *rq_globals = + nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0), + .base = offsetof(struct anv_push_constants, ray_query_globals), + .range = sizeof_field(struct anv_push_constants, ray_query_globals)); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, rq_globals); + + return true; +} + +static bool +apply_pipeline_layout(nir_builder *b, nir_instr *instr, void *_state) +{ + struct apply_pipeline_layout_state *state = _state; + + switch (instr->type) { + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_vulkan_resource_index: + return lower_res_index_intrinsic(b, intrin, state); + case nir_intrinsic_vulkan_resource_reindex: + return lower_res_reindex_intrinsic(b, intrin, state); + case nir_intrinsic_load_vulkan_descriptor: + return lower_load_vulkan_descriptor(b, intrin, state); + case nir_intrinsic_get_ssbo_size: + return lower_get_ssbo_size(b, intrin, state); + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_fadd: + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: + case nir_intrinsic_image_deref_load_param_intel: + case nir_intrinsic_image_deref_load_raw_intel: + case nir_intrinsic_image_deref_store_raw_intel: + return lower_image_intrinsic(b, intrin, state); + case nir_intrinsic_load_constant: + return lower_load_constant(b, intrin, state); + case nir_intrinsic_load_ray_query_global_intel: + return lower_ray_query_globals(b, intrin, state); + default: + return false; + } + break; + } + case nir_instr_type_tex: + return lower_tex(b, nir_instr_as_tex(instr), state); + default: + return false; + } +} + +struct binding_info { + uint32_t binding; + uint8_t set; + uint16_t score; +}; + +static int +compare_binding_infos(const void *_a, const void *_b) +{ + const struct binding_info *a = _a, *b = _b; + if (a->score != b->score) + return b->score - a->score; + + if (a->set != b->set) + return a->set - b->set; + + return a->binding - b->binding; +} + +void +anv_nir_apply_pipeline_layout(nir_shader *shader, + const struct anv_physical_device *pdevice, + bool robust_buffer_access, + const struct anv_pipeline_layout *layout, + struct anv_pipeline_bind_map *map) +{ + void *mem_ctx = ralloc_context(NULL); + + struct apply_pipeline_layout_state state = { + .pdevice = pdevice, + .layout = layout, + .add_bounds_checks = robust_buffer_access, + .desc_addr_format = + 
brw_shader_stage_requires_bindless_resources(shader->info.stage) ? + nir_address_format_64bit_global_32bit_offset : + nir_address_format_32bit_index_offset, + .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_buffer_access), + .ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_buffer_access), + .lowered_instrs = _mesa_pointer_set_create(mem_ctx), + }; + + for (unsigned s = 0; s < layout->num_sets; s++) { + const unsigned count = layout->set[s].layout->binding_count; + state.set[s].use_count = rzalloc_array(mem_ctx, uint8_t, count); + state.set[s].surface_offsets = rzalloc_array(mem_ctx, uint8_t, count); + state.set[s].sampler_offsets = rzalloc_array(mem_ctx, uint8_t, count); + } + + nir_shader_instructions_pass(shader, get_used_bindings, + nir_metadata_all, &state); + + for (unsigned s = 0; s < layout->num_sets; s++) { + if (state.desc_addr_format != nir_address_format_32bit_index_offset) { + state.set[s].desc_offset = BINDLESS_OFFSET; + } else if (state.set[s].desc_buffer_used) { + map->surface_to_descriptor[map->surface_count] = + (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_DESCRIPTORS, + .index = s, + }; + state.set[s].desc_offset = map->surface_count; + map->surface_count++; + } + } + + if (state.uses_constants && anv_use_relocations(pdevice)) { + state.constants_offset = map->surface_count; + map->surface_to_descriptor[map->surface_count].set = + ANV_DESCRIPTOR_SET_SHADER_CONSTANTS; + map->surface_count++; + } + + unsigned used_binding_count = 0; + for (uint32_t set = 0; set < layout->num_sets; set++) { + struct anv_descriptor_set_layout *set_layout = layout->set[set].layout; + for (unsigned b = 0; b < set_layout->binding_count; b++) { + if (state.set[set].use_count[b] == 0) + continue; + + used_binding_count++; + } + } + + struct binding_info *infos = + rzalloc_array(mem_ctx, struct binding_info, used_binding_count); + used_binding_count = 0; + for (uint32_t set = 0; set < layout->num_sets; set++) { + const struct anv_descriptor_set_layout *set_layout = layout->set[set].layout; + for (unsigned b = 0; b < set_layout->binding_count; b++) { + if (state.set[set].use_count[b] == 0) + continue; + + const struct anv_descriptor_set_binding_layout *binding = + &layout->set[set].layout->binding[b]; + + /* Do a fixed-point calculation to generate a score based on the + * number of uses and the binding array size. We shift by 7 instead + * of 8 because we're going to use the top bit below to make + * everything which does not support bindless super higher priority + * than things which do. + */ + uint16_t score = ((uint16_t)state.set[set].use_count[b] << 7) / + binding->array_size; + + /* If the descriptor type doesn't support bindless then put it at the + * beginning so we guarantee it gets a slot. + */ + if (!anv_descriptor_supports_bindless(pdevice, binding, true) || + !anv_descriptor_supports_bindless(pdevice, binding, false)) + score |= 1 << 15; + + infos[used_binding_count++] = (struct binding_info) { + .set = set, + .binding = b, + .score = score, + }; + } + } + + /* Order the binding infos based on score with highest scores first. If + * scores are equal we then order by set and binding. 
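+ *
+ * For instance (numbers purely illustrative): a binding used 3 times with
+ * array_size 1 scores (3 << 7) / 1 = 384, while one used 4 times with
+ * array_size 16 scores (4 << 7) / 16 = 32, so the former gets its binding
+ * table slots first; any binding that cannot go bindless also gets bit 15
+ * set and therefore always sorts ahead of bindless-capable ones.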
+ */ + qsort(infos, used_binding_count, sizeof(struct binding_info), + compare_binding_infos); + + for (unsigned i = 0; i < used_binding_count; i++) { + unsigned set = infos[i].set, b = infos[i].binding; + const struct anv_descriptor_set_binding_layout *binding = + &layout->set[set].layout->binding[b]; + + const uint32_t array_size = binding->array_size; + + if (binding->dynamic_offset_index >= 0) + state.has_dynamic_buffers = true; + + if (binding->data & ANV_DESCRIPTOR_SURFACE_STATE) { + if (map->surface_count + array_size > MAX_BINDING_TABLE_SIZE || + anv_descriptor_requires_bindless(pdevice, binding, false) || + brw_shader_stage_requires_bindless_resources(shader->info.stage)) { + /* If this descriptor doesn't fit in the binding table or if it + * requires bindless for some reason, flag it as bindless. + */ + assert(anv_descriptor_supports_bindless(pdevice, binding, false)); + state.set[set].surface_offsets[b] = BINDLESS_OFFSET; + } else { + state.set[set].surface_offsets[b] = map->surface_count; + if (binding->dynamic_offset_index < 0) { + struct anv_sampler **samplers = binding->immutable_samplers; + for (unsigned i = 0; i < binding->array_size; i++) { + uint8_t planes = samplers ? samplers[i]->n_planes : 1; + for (uint8_t p = 0; p < planes; p++) { + map->surface_to_descriptor[map->surface_count++] = + (struct anv_pipeline_binding) { + .set = set, + .index = binding->descriptor_index + i, + .plane = p, + }; + } + } + } else { + for (unsigned i = 0; i < binding->array_size; i++) { + map->surface_to_descriptor[map->surface_count++] = + (struct anv_pipeline_binding) { + .set = set, + .index = binding->descriptor_index + i, + .dynamic_offset_index = + layout->set[set].dynamic_offset_start + + binding->dynamic_offset_index + i, + }; + } + } + } + assert(map->surface_count <= MAX_BINDING_TABLE_SIZE); + } + + if (binding->data & ANV_DESCRIPTOR_SAMPLER_STATE) { + if (map->sampler_count + array_size > MAX_SAMPLER_TABLE_SIZE || + anv_descriptor_requires_bindless(pdevice, binding, true) || + brw_shader_stage_requires_bindless_resources(shader->info.stage)) { + /* If this descriptor doesn't fit in the binding table or if it + * requires bindless for some reason, flag it as bindless. + * + * We also make large sampler arrays bindless because we can avoid + * using indirect sends thanks to bindless samplers being packed + * less tightly than the sampler table. + */ + assert(anv_descriptor_supports_bindless(pdevice, binding, true)); + state.set[set].sampler_offsets[b] = BINDLESS_OFFSET; + } else { + state.set[set].sampler_offsets[b] = map->sampler_count; + struct anv_sampler **samplers = binding->immutable_samplers; + for (unsigned i = 0; i < binding->array_size; i++) { + uint8_t planes = samplers ? 
samplers[i]->n_planes : 1; + for (uint8_t p = 0; p < planes; p++) { + map->sampler_to_descriptor[map->sampler_count++] = + (struct anv_pipeline_binding) { + .set = set, + .index = binding->descriptor_index + i, + .plane = p, + }; + } + } + } + } + } + + nir_foreach_image_variable(var, shader) { + const uint32_t set = var->data.descriptor_set; + const uint32_t binding = var->data.binding; + const struct anv_descriptor_set_binding_layout *bind_layout = + &layout->set[set].layout->binding[binding]; + const uint32_t array_size = bind_layout->array_size; + + if (state.set[set].use_count[binding] == 0) + continue; + + if (state.set[set].surface_offsets[binding] >= MAX_BINDING_TABLE_SIZE) + continue; + + struct anv_pipeline_binding *pipe_binding = + &map->surface_to_descriptor[state.set[set].surface_offsets[binding]]; + for (unsigned i = 0; i < array_size; i++) { + assert(pipe_binding[i].set == set); + assert(pipe_binding[i].index == bind_layout->descriptor_index + i); + + pipe_binding[i].lowered_storage_surface = + image_binding_needs_lowered_surface(var); + } + } + + /* Before we do the normal lowering, we look for any SSBO operations + * that we can lower to the BTI model and lower them up-front. The BTI + * model can perform better than the A64 model for a couple reasons: + * + * 1. 48-bit address calculations are potentially expensive and using + * the BTI model lets us simply compute 32-bit offsets and the + * hardware adds the 64-bit surface base address. + * + * 2. The BTI messages, because they use surface states, do bounds + * checking for us. With the A64 model, we have to do our own + * bounds checking and this means wider pointers and extra + * calculations and branching in the shader. + * + * The solution to both of these is to convert things to the BTI model + * opportunistically. The reason why we need to do this as a pre-pass + * is for two reasons: + * + * 1. The BTI model requires nir_address_format_32bit_index_offset + * pointers which are not the same type as the pointers needed for + * the A64 model. Because all our derefs are set up for the A64 + * model (in case we have variable pointers), we have to crawl all + * the way back to the vulkan_resource_index intrinsic and build a + * completely fresh index+offset calculation. + * + * 2. Because the variable-pointers-capable lowering that we do as part + * of apply_pipeline_layout_block is destructive (It really has to + * be to handle variable pointers properly), we've lost the deref + * information by the time we get to the load/store/atomic + * intrinsics in that pass. + */ + nir_shader_instructions_pass(shader, lower_direct_buffer_instr, + nir_metadata_block_index | + nir_metadata_dominance, + &state); + + /* We just got rid of all the direct access. Delete it so it's not in the + * way when we do our indirect lowering. + */ + nir_opt_dce(shader); + + nir_shader_instructions_pass(shader, apply_pipeline_layout, + nir_metadata_block_index | + nir_metadata_dominance, + &state); + + ralloc_free(mem_ctx); + + if (brw_shader_stage_is_bindless(shader->info.stage)) { + assert(map->surface_count == 0); + assert(map->sampler_count == 0); + } + + /* Now that we're done computing the surface and sampler portions of the + * bind map, hash them. This lets us quickly determine if the actual + * mapping has changed and not just a no-op pipeline change. 
+ */ + _mesa_sha1_compute(map->surface_to_descriptor, + map->surface_count * sizeof(struct anv_pipeline_binding), + map->surface_sha1); + _mesa_sha1_compute(map->sampler_to_descriptor, + map->sampler_count * sizeof(struct anv_pipeline_binding), + map->sampler_sha1); +} diff --git a/src/intel/vulkan_hasvk/anv_nir_compute_push_layout.c b/src/intel/vulkan_hasvk/anv_nir_compute_push_layout.c new file mode 100644 index 00000000000..2385c5aea20 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir_compute_push_layout.c @@ -0,0 +1,290 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_nir.h" +#include "nir_builder.h" +#include "compiler/brw_nir.h" +#include "util/mesa-sha1.h" + +#define sizeof_field(type, field) sizeof(((type *)0)->field) + +void +anv_nir_compute_push_layout(nir_shader *nir, + const struct anv_physical_device *pdevice, + bool robust_buffer_access, + struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map, + void *mem_ctx) +{ + const struct brw_compiler *compiler = pdevice->compiler; + const struct intel_device_info *devinfo = compiler->devinfo; + memset(map->push_ranges, 0, sizeof(map->push_ranges)); + + bool has_const_ubo = false; + unsigned push_start = UINT_MAX, push_end = 0; + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_ubo: + if (nir_src_is_const(intrin->src[0]) && + nir_src_is_const(intrin->src[1])) + has_const_ubo = true; + break; + + case nir_intrinsic_load_push_constant: { + unsigned base = nir_intrinsic_base(intrin); + unsigned range = nir_intrinsic_range(intrin); + push_start = MIN2(push_start, base); + push_end = MAX2(push_end, base + range); + break; + } + + case nir_intrinsic_load_desc_set_address_intel: + push_start = MIN2(push_start, + offsetof(struct anv_push_constants, desc_sets)); + push_end = MAX2(push_end, push_start + + sizeof_field(struct anv_push_constants, desc_sets)); + break; + + default: + break; + } + } + } + } + + const bool has_push_intrinsic = push_start <= push_end; + + const bool push_ubo_ranges = + pdevice->info.verx10 >= 75 && + has_const_ubo && nir->info.stage != MESA_SHADER_COMPUTE && + 
!brw_shader_stage_requires_bindless_resources(nir->info.stage); + + if (push_ubo_ranges && robust_buffer_access) { + /* We can't on-the-fly adjust our push ranges because doing so would + * mess up the layout in the shader. When robustBufferAccess is + * enabled, we push a mask into the shader indicating which pushed + * registers are valid and we zero out the invalid ones at the top of + * the shader. + */ + const uint32_t push_reg_mask_start = + offsetof(struct anv_push_constants, push_reg_mask[nir->info.stage]); + const uint32_t push_reg_mask_end = push_reg_mask_start + sizeof(uint64_t); + push_start = MIN2(push_start, push_reg_mask_start); + push_end = MAX2(push_end, push_reg_mask_end); + } + + if (nir->info.stage == MESA_SHADER_COMPUTE && devinfo->verx10 < 125) { + /* For compute shaders, we always have to have the subgroup ID. The + * back-end compiler will "helpfully" add it for us in the last push + * constant slot. Yes, there is an off-by-one error here but that's + * because the back-end will add it so we want to claim the number of + * push constants one dword less than the full amount including + * gl_SubgroupId. + */ + assert(push_end <= offsetof(struct anv_push_constants, cs.subgroup_id)); + push_end = offsetof(struct anv_push_constants, cs.subgroup_id); + } + + /* Align push_start down to a 32B boundary and make it no larger than + * push_end (no push constants is indicated by push_start = UINT_MAX). + */ + push_start = MIN2(push_start, push_end); + push_start = align_down_u32(push_start, 32); + + /* For vec4 our push data size needs to be aligned to a vec4 and for + * scalar, it needs to be aligned to a DWORD. + */ + const unsigned align = compiler->scalar_stage[nir->info.stage] ? 4 : 16; + nir->num_uniforms = ALIGN(push_end - push_start, align); + prog_data->nr_params = nir->num_uniforms / 4; + prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params); + + struct anv_push_range push_constant_range = { + .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, + .start = push_start / 32, + .length = DIV_ROUND_UP(push_end - push_start, 32), + }; + + if (has_push_intrinsic) { + nir_foreach_function(function, nir) { + if (!function->impl) + continue; + + nir_builder build, *b = &build; + nir_builder_init(b, function->impl); + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_push_constant: { + /* With bindless shaders we load uniforms with SEND + * messages. All the push constants are located after the + * RT_DISPATCH_GLOBALS. We just need to add the offset to + * the address right after RT_DISPATCH_GLOBALS (see + * brw_nir_lower_rt_intrinsics.c). + */ + unsigned base_offset = + brw_shader_stage_requires_bindless_resources(nir->info.stage) ? 
0 : push_start; + intrin->intrinsic = nir_intrinsic_load_uniform; + nir_intrinsic_set_base(intrin, + nir_intrinsic_base(intrin) - + base_offset); + break; + } + + case nir_intrinsic_load_desc_set_address_intel: { + b->cursor = nir_before_instr(&intrin->instr); + nir_ssa_def *pc_load = nir_load_uniform(b, 1, 64, + nir_imul_imm(b, intrin->src[0].ssa, sizeof(uint64_t)), + .base = offsetof(struct anv_push_constants, desc_sets), + .range = sizeof_field(struct anv_push_constants, desc_sets), + .dest_type = nir_type_uint64); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, pc_load); + break; + } + + default: + break; + } + } + } + } + } + + if (push_ubo_ranges) { + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + + /* The vec4 back-end pushes at most 32 regs while the scalar back-end + * pushes up to 64. This is primarily because the scalar back-end has a + * massively more competent register allocator and so the risk of + * spilling due to UBO pushing isn't nearly as high. + */ + const unsigned max_push_regs = + compiler->scalar_stage[nir->info.stage] ? 64 : 32; + + unsigned total_push_regs = push_constant_range.length; + for (unsigned i = 0; i < 4; i++) { + if (total_push_regs + prog_data->ubo_ranges[i].length > max_push_regs) + prog_data->ubo_ranges[i].length = max_push_regs - total_push_regs; + total_push_regs += prog_data->ubo_ranges[i].length; + } + assert(total_push_regs <= max_push_regs); + + int n = 0; + + if (push_constant_range.length > 0) + map->push_ranges[n++] = push_constant_range; + + if (robust_buffer_access) { + const uint32_t push_reg_mask_offset = + offsetof(struct anv_push_constants, push_reg_mask[nir->info.stage]); + assert(push_reg_mask_offset >= push_start); + prog_data->push_reg_mask_param = + (push_reg_mask_offset - push_start) / 4; + } + + unsigned range_start_reg = push_constant_range.length; + + for (int i = 0; i < 4; i++) { + struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i]; + if (ubo_range->length == 0) + continue; + + if (n >= 4 || (n == 3 && compiler->constant_buffer_0_is_relative)) { + memset(ubo_range, 0, sizeof(*ubo_range)); + continue; + } + + const struct anv_pipeline_binding *binding = + &map->surface_to_descriptor[ubo_range->block]; + + map->push_ranges[n++] = (struct anv_push_range) { + .set = binding->set, + .index = binding->index, + .dynamic_offset_index = binding->dynamic_offset_index, + .start = ubo_range->start, + .length = ubo_range->length, + }; + + /* We only bother to shader-zero pushed client UBOs */ + if (binding->set < MAX_SETS && robust_buffer_access) { + prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg, + ubo_range->length); + } + + range_start_reg += ubo_range->length; + } + } else { + /* For Ivy Bridge, the push constants packets have a different + * rule that would require us to iterate in the other direction + * and possibly mess around with dynamic state base address. + * Don't bother; just emit regular push constants at n = 0. + * + * In the compute case, we don't have multiple push ranges so it's + * better to just provide one in push_ranges[0]. + */ + map->push_ranges[0] = push_constant_range; + } + + /* Now that we're done computing the push constant portion of the + * bind map, hash it. This lets us quickly determine if the actual + * mapping has changed and not just a no-op pipeline change. 
+ */ + _mesa_sha1_compute(map->push_ranges, + sizeof(map->push_ranges), + map->push_sha1); +} + +void +anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data, + struct anv_pipeline_bind_map *map) +{ +#ifndef NDEBUG + unsigned prog_data_push_size = DIV_ROUND_UP(prog_data->nr_params, 8); + for (unsigned i = 0; i < 4; i++) + prog_data_push_size += prog_data->ubo_ranges[i].length; + + unsigned bind_map_push_size = 0; + for (unsigned i = 0; i < 4; i++) + bind_map_push_size += map->push_ranges[i].length; + + /* We could go through everything again but it should be enough to assert + * that they push the same number of registers. This should alert us if + * the back-end compiler decides to re-arrange stuff or shrink a range. + */ + assert(prog_data_push_size == bind_map_push_size); +#endif +} diff --git a/src/intel/vulkan_hasvk/anv_nir_lower_multiview.c b/src/intel/vulkan_hasvk/anv_nir_lower_multiview.c new file mode 100644 index 00000000000..dd591976ac4 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir_lower_multiview.c @@ -0,0 +1,324 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_nir.h" +#include "nir/nir_builder.h" +#include "util/debug.h" + +/** + * This file implements the lowering required for VK_KHR_multiview. + * + * When possible, Primitive Replication is used and the shader is modified to + * make gl_Position an array and fill it with values for each view. + * + * Otherwise we implement multiview using instanced rendering. The number of + * instances in each draw call is multiplied by the number of views in the + * subpass. Then, in the shader, we divide gl_InstanceId by the number of + * views and use gl_InstanceId % view_count to compute the actual ViewIndex. + */ + +struct lower_multiview_state { + nir_builder builder; + + uint32_t view_mask; + + nir_ssa_def *instance_id; + nir_ssa_def *view_index; +}; + +static nir_ssa_def * +build_instance_id(struct lower_multiview_state *state) +{ + assert(state->builder.shader->info.stage == MESA_SHADER_VERTEX); + + if (state->instance_id == NULL) { + nir_builder *b = &state->builder; + + b->cursor = nir_before_block(nir_start_block(b->impl)); + + /* We use instancing for implementing multiview. The actual instance id + * is given by dividing instance_id by the number of views in this + * subpass. 
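+ *
+ * Worked example (illustrative): with two views enabled the draw is
+ * emitted with twice the instance count, so hardware instance ids
+ * 0,1,2,3,... become application instance ids 0,0,1,1,... here, while
+ * build_view_index() below recovers the view from instance_id %
+ * view_count (remapping through view_mask when the mask is sparse).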
+ */ + state->instance_id = + nir_idiv(b, nir_load_instance_id(b), + nir_imm_int(b, util_bitcount(state->view_mask))); + } + + return state->instance_id; +} + +static nir_ssa_def * +build_view_index(struct lower_multiview_state *state) +{ + assert(state->builder.shader->info.stage != MESA_SHADER_FRAGMENT); + + if (state->view_index == NULL) { + nir_builder *b = &state->builder; + + b->cursor = nir_before_block(nir_start_block(b->impl)); + + assert(state->view_mask != 0); + if (util_bitcount(state->view_mask) == 1) { + /* Set the view index directly. */ + state->view_index = nir_imm_int(b, ffs(state->view_mask) - 1); + } else if (state->builder.shader->info.stage == MESA_SHADER_VERTEX) { + /* We only support 16 viewports */ + assert((state->view_mask & 0xffff0000) == 0); + + /* We use instancing for implementing multiview. The compacted view + * id is given by instance_id % view_count. We then have to convert + * that to an actual view id. + */ + nir_ssa_def *compacted = + nir_umod(b, nir_load_instance_id(b), + nir_imm_int(b, util_bitcount(state->view_mask))); + + if (util_is_power_of_two_or_zero(state->view_mask + 1)) { + /* If we have a full view mask, then compacted is what we want */ + state->view_index = compacted; + } else { + /* Now we define a map from compacted view index to the actual + * view index that's based on the view_mask. The map is given by + * 16 nibbles, each of which is a value from 0 to 15. + */ + uint64_t remap = 0; + uint32_t i = 0; + u_foreach_bit(bit, state->view_mask) { + assert(bit < 16); + remap |= (uint64_t)bit << (i++ * 4); + } + + nir_ssa_def *shift = nir_imul(b, compacted, nir_imm_int(b, 4)); + + /* One of these days, when we have int64 everywhere, this will be + * easier. + */ + nir_ssa_def *shifted; + if (remap <= UINT32_MAX) { + shifted = nir_ushr(b, nir_imm_int(b, remap), shift); + } else { + nir_ssa_def *shifted_low = + nir_ushr(b, nir_imm_int(b, remap), shift); + nir_ssa_def *shifted_high = + nir_ushr(b, nir_imm_int(b, remap >> 32), + nir_isub(b, shift, nir_imm_int(b, 32))); + shifted = nir_bcsel(b, nir_ilt(b, shift, nir_imm_int(b, 32)), + shifted_low, shifted_high); + } + state->view_index = nir_iand(b, shifted, nir_imm_int(b, 0xf)); + } + } else { + const struct glsl_type *type = glsl_int_type(); + if (b->shader->info.stage == MESA_SHADER_TESS_CTRL || + b->shader->info.stage == MESA_SHADER_GEOMETRY) + type = glsl_array_type(type, 1, 0); + + nir_variable *idx_var = + nir_variable_create(b->shader, nir_var_shader_in, + type, "view index"); + idx_var->data.location = VARYING_SLOT_VIEW_INDEX; + if (b->shader->info.stage == MESA_SHADER_FRAGMENT) + idx_var->data.interpolation = INTERP_MODE_FLAT; + + nir_deref_instr *deref = nir_build_deref_var(b, idx_var); + if (glsl_type_is_array(type)) + deref = nir_build_deref_array_imm(b, deref, 0); + + state->view_index = nir_load_deref(b, deref); + } + } + + return state->view_index; +} + +static bool +is_load_view_index(const nir_instr *instr, const void *data) +{ + return instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_view_index; +} + +static nir_ssa_def * +replace_load_view_index_with_zero(struct nir_builder *b, + nir_instr *instr, void *data) +{ + assert(is_load_view_index(instr, data)); + return nir_imm_zero(b, 1, 32); +} + +static nir_ssa_def * +replace_load_view_index_with_layer_id(struct nir_builder *b, + nir_instr *instr, void *data) +{ + assert(is_load_view_index(instr, data)); + return nir_load_layer_id(b); +} + +bool 
+anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask, + bool use_primitive_replication) +{ + assert(shader->info.stage != MESA_SHADER_COMPUTE); + + /* If multiview isn't enabled, just lower the ViewIndex builtin to zero. */ + if (view_mask == 0) { + return nir_shader_lower_instructions(shader, is_load_view_index, + replace_load_view_index_with_zero, NULL); + } + + if (shader->info.stage == MESA_SHADER_FRAGMENT) { + return nir_shader_lower_instructions(shader, is_load_view_index, + replace_load_view_index_with_layer_id, NULL); + } + + /* This pass assumes a single entrypoint */ + nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader); + + /* Primitive Replication allows a shader to write different positions for + * each view in the same execution. If only the position depends on the + * view, then it is possible to use the feature instead of instancing to + * implement multiview. + */ + if (use_primitive_replication) { + bool progress = nir_lower_multiview(shader, view_mask); + + if (progress) { + nir_builder b; + nir_builder_init(&b, entrypoint); + b.cursor = nir_before_cf_list(&entrypoint->body); + + /* Fill Layer ID with zero. Replication will use that as base to + * apply the RTAI offsets. + */ + nir_variable *layer_id_out = + nir_variable_create(shader, nir_var_shader_out, + glsl_int_type(), "layer ID"); + layer_id_out->data.location = VARYING_SLOT_LAYER; + nir_store_var(&b, layer_id_out, nir_imm_zero(&b, 1, 32), 0x1); + } + + return progress; + } + + struct lower_multiview_state state = { + .view_mask = view_mask, + }; + + nir_builder_init(&state.builder, entrypoint); + + nir_foreach_block(block, entrypoint) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr); + + if (load->intrinsic != nir_intrinsic_load_instance_id && + load->intrinsic != nir_intrinsic_load_view_index) + continue; + + assert(load->dest.is_ssa); + + nir_ssa_def *value; + if (load->intrinsic == nir_intrinsic_load_instance_id) { + value = build_instance_id(&state); + } else { + assert(load->intrinsic == nir_intrinsic_load_view_index); + value = build_view_index(&state); + } + + nir_ssa_def_rewrite_uses(&load->dest.ssa, value); + + nir_instr_remove(&load->instr); + } + } + + /* The view index is available in all stages but the instance id is only + * available in the VS. If it's not a fragment shader, we need to pass + * the view index on to the next stage. + */ + nir_ssa_def *view_index = build_view_index(&state); + + nir_builder *b = &state.builder; + + assert(view_index->parent_instr->block == nir_start_block(entrypoint)); + b->cursor = nir_after_instr(view_index->parent_instr); + + /* Unless there is only one possible view index (that would be set + * directly), pass it to the next stage. 
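+    * In the single-view case build_view_index() produced an immediate
+    * constant above, so a later stage can recompute the same value from
+    * the view mask instead of reading a varying.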
*/ + if (util_bitcount(state.view_mask) != 1) { + nir_variable *view_index_out = + nir_variable_create(shader, nir_var_shader_out, + glsl_int_type(), "view index"); + view_index_out->data.location = VARYING_SLOT_VIEW_INDEX; + nir_store_var(b, view_index_out, view_index, 0x1); + } + + nir_variable *layer_id_out = + nir_variable_create(shader, nir_var_shader_out, + glsl_int_type(), "layer ID"); + layer_id_out->data.location = VARYING_SLOT_LAYER; + nir_store_var(b, layer_id_out, view_index, 0x1); + + nir_metadata_preserve(entrypoint, nir_metadata_block_index | + nir_metadata_dominance); + + return true; +} + +bool +anv_check_for_primitive_replication(struct anv_device *device, + VkShaderStageFlags stages, + nir_shader **shaders, + uint32_t view_mask) +{ + assert(device->info->ver >= 12); + + static int primitive_replication_max_views = -1; + if (primitive_replication_max_views < 0) { + /* TODO: Figure out why we are not getting same benefits for larger than + * 2 views. For now use Primitive Replication just for the 2-view case + * by default. + */ + const unsigned default_max_views = 2; + + primitive_replication_max_views = + MIN2(MAX_VIEWS_FOR_PRIMITIVE_REPLICATION, + env_var_as_unsigned("ANV_PRIMITIVE_REPLICATION_MAX_VIEWS", + default_max_views)); + } + + /* TODO: We should be able to support replication at 'geometry' stages + * later than Vertex. In that case only the last stage can refer to + * gl_ViewIndex. + */ + if (stages & ~(VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT)) + return false; + + int view_count = util_bitcount(view_mask); + if (view_count == 1 || view_count > primitive_replication_max_views) + return false; + + return nir_can_lower_multiview(shaders[MESA_SHADER_VERTEX]); +} diff --git a/src/intel/vulkan_hasvk/anv_nir_lower_ubo_loads.c b/src/intel/vulkan_hasvk/anv_nir_lower_ubo_loads.c new file mode 100644 index 00000000000..5a170352c80 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir_lower_ubo_loads.c @@ -0,0 +1,124 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "anv_nir.h" +#include "nir_builder.h" + +static bool +lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr); + if (load->intrinsic != nir_intrinsic_load_global_constant_offset && + load->intrinsic != nir_intrinsic_load_global_constant_bounded) + return false; + + b->cursor = nir_before_instr(instr); + + nir_ssa_def *base_addr = load->src[0].ssa; + nir_ssa_def *bound = NULL; + if (load->intrinsic == nir_intrinsic_load_global_constant_bounded) + bound = load->src[2].ssa; + + unsigned bit_size = load->dest.ssa.bit_size; + assert(bit_size >= 8 && bit_size % 8 == 0); + unsigned byte_size = bit_size / 8; + + nir_ssa_def *val; + if (nir_src_is_const(load->src[1])) { + uint32_t offset = nir_src_as_uint(load->src[1]); + + /* Things should be component-aligned. */ + assert(offset % byte_size == 0); + + assert(ANV_UBO_ALIGNMENT == 64); + + unsigned suboffset = offset % 64; + uint64_t aligned_offset = offset - suboffset; + + /* Load two just in case we go over a 64B boundary */ + nir_ssa_def *data[2]; + for (unsigned i = 0; i < 2; i++) { + nir_ssa_def *pred; + if (bound) { + pred = nir_ilt(b, nir_imm_int(b, aligned_offset + i * 64 + 63), + bound); + } else { + pred = nir_imm_true(b); + } + + nir_ssa_def *addr = nir_iadd_imm(b, base_addr, + aligned_offset + i * 64); + + data[i] = nir_load_global_const_block_intel(b, 16, addr, pred); + } + + val = nir_extract_bits(b, data, 2, suboffset * 8, + load->num_components, bit_size); + } else { + nir_ssa_def *offset = load->src[1].ssa; + nir_ssa_def *addr = nir_iadd(b, base_addr, nir_u2u64(b, offset)); + + if (bound) { + nir_ssa_def *zero = nir_imm_zero(b, load->num_components, bit_size); + + unsigned load_size = byte_size * load->num_components; + nir_ssa_def *in_bounds = + nir_ilt(b, nir_iadd_imm(b, offset, load_size - 1), bound); + + nir_push_if(b, in_bounds); + + nir_ssa_def *load_val = + nir_build_load_global_constant(b, load->dest.ssa.num_components, + load->dest.ssa.bit_size, addr, + .access = nir_intrinsic_access(load), + .align_mul = nir_intrinsic_align_mul(load), + .align_offset = nir_intrinsic_align_offset(load)); + + nir_pop_if(b, NULL); + + val = nir_if_phi(b, load_val, zero); + } else { + val = nir_build_load_global_constant(b, load->dest.ssa.num_components, + load->dest.ssa.bit_size, addr, + .access = nir_intrinsic_access(load), + .align_mul = nir_intrinsic_align_mul(load), + .align_offset = nir_intrinsic_align_offset(load)); + } + } + + nir_ssa_def_rewrite_uses(&load->dest.ssa, val); + nir_instr_remove(&load->instr); + + return true; +} + +bool +anv_nir_lower_ubo_loads(nir_shader *shader) +{ + return nir_shader_instructions_pass(shader, lower_ubo_load_instr, + nir_metadata_none, + NULL); +} diff --git a/src/intel/vulkan_hasvk/anv_nir_lower_ycbcr_textures.c b/src/intel/vulkan_hasvk/anv_nir_lower_ycbcr_textures.c new file mode 100644 index 00000000000..e82cd032e20 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_nir_lower_ycbcr_textures.c @@ -0,0 +1,349 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * 
Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_nir.h" +#include "anv_private.h" +#include "nir/nir.h" +#include "nir/nir_builder.h" +#include "nir/nir_vulkan.h" + +struct ycbcr_state { + nir_builder *builder; + nir_ssa_def *image_size; + nir_tex_instr *origin_tex; + nir_deref_instr *tex_deref; + struct anv_ycbcr_conversion *conversion; +}; + +/* TODO: we should probably replace this with a push constant/uniform. */ +static nir_ssa_def * +get_texture_size(struct ycbcr_state *state, nir_deref_instr *texture) +{ + if (state->image_size) + return state->image_size; + + nir_builder *b = state->builder; + const struct glsl_type *type = texture->type; + nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1); + + tex->op = nir_texop_txs; + tex->sampler_dim = glsl_get_sampler_dim(type); + tex->is_array = glsl_sampler_type_is_array(type); + tex->is_shadow = glsl_sampler_type_is_shadow(type); + tex->dest_type = nir_type_int32; + + tex->src[0].src_type = nir_tex_src_texture_deref; + tex->src[0].src = nir_src_for_ssa(&texture->dest.ssa); + + nir_ssa_dest_init(&tex->instr, &tex->dest, + nir_tex_instr_dest_size(tex), 32, NULL); + nir_builder_instr_insert(b, &tex->instr); + + state->image_size = nir_i2f32(b, &tex->dest.ssa); + + return state->image_size; +} + +static nir_ssa_def * +implicit_downsampled_coord(nir_builder *b, + nir_ssa_def *value, + nir_ssa_def *max_value, + int div_scale) +{ + return nir_fadd(b, + value, + nir_fdiv(b, + nir_imm_float(b, 1.0f), + nir_fmul(b, + nir_imm_float(b, div_scale), + max_value))); +} + +static nir_ssa_def * +implicit_downsampled_coords(struct ycbcr_state *state, + nir_ssa_def *old_coords, + const struct anv_format_plane *plane_format) +{ + nir_builder *b = state->builder; + struct anv_ycbcr_conversion *conversion = state->conversion; + nir_ssa_def *image_size = get_texture_size(state, state->tex_deref); + nir_ssa_def *comp[4] = { NULL, }; + int c; + + for (c = 0; c < ARRAY_SIZE(conversion->chroma_offsets); c++) { + if (plane_format->denominator_scales[c] > 1 && + conversion->chroma_offsets[c] == VK_CHROMA_LOCATION_COSITED_EVEN) { + comp[c] = implicit_downsampled_coord(b, + nir_channel(b, old_coords, c), + nir_channel(b, image_size, c), + plane_format->denominator_scales[c]); + } else { + comp[c] = nir_channel(b, old_coords, c); + } + } + + /* Leave other coordinates untouched */ + for (; c < old_coords->num_components; c++) + comp[c] = nir_channel(b, old_coords, c); + + return nir_vec(b, comp, old_coords->num_components); +} + +static nir_ssa_def * +create_plane_tex_instr_implicit(struct ycbcr_state *state, + uint32_t plane) +{ + nir_builder *b = state->builder; + struct anv_ycbcr_conversion *conversion = state->conversion; + const struct anv_format_plane *plane_format = + &conversion->format->planes[plane]; + nir_tex_instr *old_tex = state->origin_tex; + nir_tex_instr 
*tex = nir_tex_instr_create(b->shader, old_tex->num_srcs + 1); + + for (uint32_t i = 0; i < old_tex->num_srcs; i++) { + tex->src[i].src_type = old_tex->src[i].src_type; + + switch (old_tex->src[i].src_type) { + case nir_tex_src_coord: + if (plane_format->has_chroma && conversion->chroma_reconstruction) { + assert(old_tex->src[i].src.is_ssa); + tex->src[i].src = + nir_src_for_ssa(implicit_downsampled_coords(state, + old_tex->src[i].src.ssa, + plane_format)); + break; + } + FALLTHROUGH; + default: + nir_src_copy(&tex->src[i].src, &old_tex->src[i].src, &tex->instr); + break; + } + } + tex->src[tex->num_srcs - 1].src = nir_src_for_ssa(nir_imm_int(b, plane)); + tex->src[tex->num_srcs - 1].src_type = nir_tex_src_plane; + + tex->sampler_dim = old_tex->sampler_dim; + tex->dest_type = old_tex->dest_type; + + tex->op = old_tex->op; + tex->coord_components = old_tex->coord_components; + tex->is_new_style_shadow = old_tex->is_new_style_shadow; + tex->component = old_tex->component; + + tex->texture_index = old_tex->texture_index; + tex->sampler_index = old_tex->sampler_index; + tex->is_array = old_tex->is_array; + + nir_ssa_dest_init(&tex->instr, &tex->dest, + old_tex->dest.ssa.num_components, + nir_dest_bit_size(old_tex->dest), NULL); + nir_builder_instr_insert(b, &tex->instr); + + return &tex->dest.ssa; +} + +static unsigned +channel_to_component(enum isl_channel_select channel) +{ + switch (channel) { + case ISL_CHANNEL_SELECT_RED: + return 0; + case ISL_CHANNEL_SELECT_GREEN: + return 1; + case ISL_CHANNEL_SELECT_BLUE: + return 2; + case ISL_CHANNEL_SELECT_ALPHA: + return 3; + default: + unreachable("invalid channel"); + return 0; + } +} + +static enum isl_channel_select +swizzle_channel(struct isl_swizzle swizzle, unsigned channel) +{ + switch (channel) { + case 0: + return swizzle.r; + case 1: + return swizzle.g; + case 2: + return swizzle.b; + case 3: + return swizzle.a; + default: + unreachable("invalid channel"); + return 0; + } +} + +static bool +anv_nir_lower_ycbcr_textures_instr(nir_builder *builder, + nir_instr *instr, + void *cb_data) +{ + const struct anv_pipeline_layout *layout = cb_data; + + if (instr->type != nir_instr_type_tex) + return false; + + nir_tex_instr *tex = nir_instr_as_tex(instr); + + int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref); + assert(deref_src_idx >= 0); + nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src); + + nir_variable *var = nir_deref_instr_get_variable(deref); + const struct anv_descriptor_set_layout *set_layout = + layout->set[var->data.descriptor_set].layout; + const struct anv_descriptor_set_binding_layout *binding = + &set_layout->binding[var->data.binding]; + + /* For the following instructions, we don't apply any change and let the + * instruction apply to the first plane. 
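+    * (txs, query_levels and lod only return size/LOD information, not
+    * texel data, so letting them sample a single plane is enough and no
+    * YCbCr conversion or swizzling is required for them.)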
+ */ + if (tex->op == nir_texop_txs || + tex->op == nir_texop_query_levels || + tex->op == nir_texop_lod) + return false; + + if (binding->immutable_samplers == NULL) + return false; + + assert(tex->texture_index == 0); + unsigned array_index = 0; + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + if (!nir_src_is_const(deref->arr.index)) + return false; + array_index = nir_src_as_uint(deref->arr.index); + array_index = MIN2(array_index, binding->array_size - 1); + } + const struct anv_sampler *sampler = binding->immutable_samplers[array_index]; + + if (sampler->conversion == NULL) + return false; + + struct ycbcr_state state = { + .builder = builder, + .origin_tex = tex, + .tex_deref = deref, + .conversion = sampler->conversion, + }; + + builder->cursor = nir_before_instr(&tex->instr); + + const struct anv_format *format = state.conversion->format; + const struct isl_format_layout *y_isl_layout = NULL; + for (uint32_t p = 0; p < format->n_planes; p++) { + if (!format->planes[p].has_chroma) + y_isl_layout = isl_format_get_layout(format->planes[p].isl_format); + } + assert(y_isl_layout != NULL); + uint8_t y_bpc = y_isl_layout->channels_array[0].bits; + + /* |ycbcr_comp| holds components in the order : Cr-Y-Cb */ + nir_ssa_def *zero = nir_imm_float(builder, 0.0f); + nir_ssa_def *one = nir_imm_float(builder, 1.0f); + /* Use extra 2 channels for following swizzle */ + nir_ssa_def *ycbcr_comp[5] = { zero, zero, zero, one, zero }; + + uint8_t ycbcr_bpcs[5]; + memset(ycbcr_bpcs, y_bpc, sizeof(ycbcr_bpcs)); + + /* Go through all the planes and gather the samples into a |ycbcr_comp| + * while applying a swizzle required by the spec: + * + * R, G, B should respectively map to Cr, Y, Cb + */ + for (uint32_t p = 0; p < format->n_planes; p++) { + const struct anv_format_plane *plane_format = &format->planes[p]; + nir_ssa_def *plane_sample = create_plane_tex_instr_implicit(&state, p); + + for (uint32_t pc = 0; pc < 4; pc++) { + enum isl_channel_select ycbcr_swizzle = + swizzle_channel(plane_format->ycbcr_swizzle, pc); + if (ycbcr_swizzle == ISL_CHANNEL_SELECT_ZERO) + continue; + + unsigned ycbcr_component = channel_to_component(ycbcr_swizzle); + ycbcr_comp[ycbcr_component] = nir_channel(builder, plane_sample, pc); + + /* Also compute the number of bits for each component. */ + const struct isl_format_layout *isl_layout = + isl_format_get_layout(plane_format->isl_format); + ycbcr_bpcs[ycbcr_component] = isl_layout->channels_array[pc].bits; + } + } + + /* Now remaps components to the order specified by the conversion. 
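+    * ycbcr_comp[] carries two extra entries for this step: index 3
+    * holds the constant 1.0 used for VK_COMPONENT_SWIZZLE_ONE and
+    * index 4 holds 0.0 for VK_COMPONENT_SWIZZLE_ZERO (see
+    * swizzle_mapping below).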
*/ + nir_ssa_def *swizzled_comp[4] = { NULL, }; + uint32_t swizzled_bpcs[4] = { 0, }; + + for (uint32_t i = 0; i < ARRAY_SIZE(state.conversion->mapping); i++) { + /* Maps to components in |ycbcr_comp| */ + static const uint32_t swizzle_mapping[] = { + [VK_COMPONENT_SWIZZLE_ZERO] = 4, + [VK_COMPONENT_SWIZZLE_ONE] = 3, + [VK_COMPONENT_SWIZZLE_R] = 0, + [VK_COMPONENT_SWIZZLE_G] = 1, + [VK_COMPONENT_SWIZZLE_B] = 2, + [VK_COMPONENT_SWIZZLE_A] = 3, + }; + const VkComponentSwizzle m = state.conversion->mapping[i]; + + if (m == VK_COMPONENT_SWIZZLE_IDENTITY) { + swizzled_comp[i] = ycbcr_comp[i]; + swizzled_bpcs[i] = ycbcr_bpcs[i]; + } else { + swizzled_comp[i] = ycbcr_comp[swizzle_mapping[m]]; + swizzled_bpcs[i] = ycbcr_bpcs[swizzle_mapping[m]]; + } + } + + nir_ssa_def *result = nir_vec(builder, swizzled_comp, 4); + if (state.conversion->ycbcr_model != VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY) { + result = nir_convert_ycbcr_to_rgb(builder, + state.conversion->ycbcr_model, + state.conversion->ycbcr_range, + result, + swizzled_bpcs); + } + + nir_ssa_def_rewrite_uses(&tex->dest.ssa, result); + nir_instr_remove(&tex->instr); + + return true; +} + +bool +anv_nir_lower_ycbcr_textures(nir_shader *shader, + const struct anv_pipeline_layout *layout) +{ + return nir_shader_instructions_pass(shader, + anv_nir_lower_ycbcr_textures_instr, + nir_metadata_block_index | + nir_metadata_dominance, + (void *)layout); +} diff --git a/src/intel/vulkan_hasvk/anv_perf.c b/src/intel/vulkan_hasvk/anv_perf.c new file mode 100644 index 00000000000..36c4c30e381 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_perf.c @@ -0,0 +1,488 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include + +#include "anv_private.h" +#include "vk_util.h" + +#include "perf/intel_perf.h" +#include "perf/intel_perf_mdapi.h" + +#include "util/mesa-sha1.h" + +void +anv_physical_device_init_perf(struct anv_physical_device *device, int fd) +{ + const struct intel_device_info *devinfo = &device->info; + + device->perf = NULL; + + /* We need self modifying batches. The i915 parser prevents it on + * Gfx7.5 :( maybe one day. 
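+    * When we bail out here device->perf is left NULL, which the
+    * performance query entry points below treat as the extension not
+    * being present.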
+ */ + if (devinfo->ver < 8) + return; + + struct intel_perf_config *perf = intel_perf_new(NULL); + + intel_perf_init_metrics(perf, &device->info, fd, + false /* pipeline statistics */, + true /* register snapshots */); + + if (!perf->n_queries) + goto err; + + /* We need DRM_I915_PERF_PROP_HOLD_PREEMPTION support, only available in + * perf revision 2. + */ + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) { + if (!intel_perf_has_hold_preemption(perf)) + goto err; + } + + device->perf = perf; + + /* Compute the number of commands we need to implement a performance + * query. + */ + const struct intel_perf_query_field_layout *layout = &perf->query_layout; + device->n_perf_query_commands = 0; + for (uint32_t f = 0; f < layout->n_fields; f++) { + struct intel_perf_query_field *field = &layout->fields[f]; + + switch (field->type) { + case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: + device->n_perf_query_commands++; + break; + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: + device->n_perf_query_commands += field->size / 4; + break; + default: + unreachable("Unhandled register type"); + } + } + device->n_perf_query_commands *= 2; /* Begin & End */ + device->n_perf_query_commands += 1; /* availability */ + + return; + + err: + ralloc_free(perf); +} + +void +anv_device_perf_init(struct anv_device *device) +{ + device->perf_fd = -1; +} + +static int +anv_device_perf_open(struct anv_device *device, uint64_t metric_id) +{ + uint64_t properties[DRM_I915_PERF_PROP_MAX * 2]; + struct drm_i915_perf_open_param param; + int p = 0, stream_fd; + + properties[p++] = DRM_I915_PERF_PROP_SAMPLE_OA; + properties[p++] = true; + + properties[p++] = DRM_I915_PERF_PROP_OA_METRICS_SET; + properties[p++] = metric_id; + + properties[p++] = DRM_I915_PERF_PROP_OA_FORMAT; + properties[p++] = device->info->ver >= 8 ? + I915_OA_FORMAT_A32u40_A4u32_B8_C8 : + I915_OA_FORMAT_A45_B8_C8; + + properties[p++] = DRM_I915_PERF_PROP_OA_EXPONENT; + properties[p++] = 31; /* slowest sampling period */ + + properties[p++] = DRM_I915_PERF_PROP_CTX_HANDLE; + properties[p++] = device->context_id; + + properties[p++] = DRM_I915_PERF_PROP_HOLD_PREEMPTION; + properties[p++] = true; + + /* If global SSEU is available, pin it to the default. This will ensure on + * Gfx11 for instance we use the full EU array. Initially when perf was + * enabled we would use only half on Gfx11 because of functional + * requirements. + * + * Temporary disable this option on Gfx12.5+, kernel doesn't appear to + * support it. 
+ */ + if (intel_perf_has_global_sseu(device->physical->perf) && + device->info->verx10 < 125) { + properties[p++] = DRM_I915_PERF_PROP_GLOBAL_SSEU; + properties[p++] = (uintptr_t) &device->physical->perf->sseu; + } + + memset(¶m, 0, sizeof(param)); + param.flags = 0; + param.flags |= I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_FD_NONBLOCK; + param.properties_ptr = (uintptr_t)properties; + param.num_properties = p / 2; + + stream_fd = intel_ioctl(device->fd, DRM_IOCTL_I915_PERF_OPEN, ¶m); + return stream_fd; +} + +/* VK_INTEL_performance_query */ +VkResult anv_InitializePerformanceApiINTEL( + VkDevice _device, + const VkInitializePerformanceApiInfoINTEL* pInitializeInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!device->physical->perf) + return VK_ERROR_EXTENSION_NOT_PRESENT; + + /* Not much to do here */ + return VK_SUCCESS; +} + +VkResult anv_GetPerformanceParameterINTEL( + VkDevice _device, + VkPerformanceParameterTypeINTEL parameter, + VkPerformanceValueINTEL* pValue) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!device->physical->perf) + return VK_ERROR_EXTENSION_NOT_PRESENT; + + VkResult result = VK_SUCCESS; + switch (parameter) { + case VK_PERFORMANCE_PARAMETER_TYPE_HW_COUNTERS_SUPPORTED_INTEL: + pValue->type = VK_PERFORMANCE_VALUE_TYPE_BOOL_INTEL; + pValue->data.valueBool = VK_TRUE; + break; + + case VK_PERFORMANCE_PARAMETER_TYPE_STREAM_MARKER_VALID_BITS_INTEL: + pValue->type = VK_PERFORMANCE_VALUE_TYPE_UINT32_INTEL; + pValue->data.value32 = 25; + break; + + default: + result = VK_ERROR_FEATURE_NOT_PRESENT; + break; + } + + return result; +} + +VkResult anv_CmdSetPerformanceMarkerINTEL( + VkCommandBuffer commandBuffer, + const VkPerformanceMarkerInfoINTEL* pMarkerInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer->intel_perf_marker = pMarkerInfo->marker; + + return VK_SUCCESS; +} + +VkResult anv_AcquirePerformanceConfigurationINTEL( + VkDevice _device, + const VkPerformanceConfigurationAcquireInfoINTEL* pAcquireInfo, + VkPerformanceConfigurationINTEL* pConfiguration) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct anv_performance_configuration_intel *config; + + config = vk_object_alloc(&device->vk, NULL, sizeof(*config), + VK_OBJECT_TYPE_PERFORMANCE_CONFIGURATION_INTEL); + if (!config) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) { + config->register_config = + intel_perf_load_configuration(device->physical->perf, device->fd, + INTEL_PERF_QUERY_GUID_MDAPI); + if (!config->register_config) { + vk_object_free(&device->vk, NULL, config); + return VK_INCOMPLETE; + } + + int ret = + intel_perf_store_configuration(device->physical->perf, device->fd, + config->register_config, NULL /* guid */); + if (ret < 0) { + ralloc_free(config->register_config); + vk_object_free(&device->vk, NULL, config); + return VK_INCOMPLETE; + } + + config->config_id = ret; + } + + *pConfiguration = anv_performance_configuration_intel_to_handle(config); + + return VK_SUCCESS; +} + +VkResult anv_ReleasePerformanceConfigurationINTEL( + VkDevice _device, + VkPerformanceConfigurationINTEL _configuration) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_performance_configuration_intel, config, _configuration); + + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) + intel_ioctl(device->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config->config_id); + + ralloc_free(config->register_config); + + vk_object_free(&device->vk, NULL, config); + + return VK_SUCCESS; +} + +VkResult 
anv_QueueSetPerformanceConfigurationINTEL( + VkQueue _queue, + VkPerformanceConfigurationINTEL _configuration) +{ + ANV_FROM_HANDLE(anv_queue, queue, _queue); + ANV_FROM_HANDLE(anv_performance_configuration_intel, config, _configuration); + struct anv_device *device = queue->device; + + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) { + if (device->perf_fd < 0) { + device->perf_fd = anv_device_perf_open(device, config->config_id); + if (device->perf_fd < 0) + return VK_ERROR_INITIALIZATION_FAILED; + } else { + int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG, + (void *)(uintptr_t) config->config_id); + if (ret < 0) + return vk_device_set_lost(&device->vk, "i915-perf config failed: %m"); + } + } + + return VK_SUCCESS; +} + +void anv_UninitializePerformanceApiINTEL( + VkDevice _device) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (device->perf_fd >= 0) { + close(device->perf_fd); + device->perf_fd = -1; + } +} + +/* VK_KHR_performance_query */ +static const VkPerformanceCounterUnitKHR +intel_perf_counter_unit_to_vk_unit[] = { + [INTEL_PERF_COUNTER_UNITS_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR, + [INTEL_PERF_COUNTER_UNITS_HZ] = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR, + [INTEL_PERF_COUNTER_UNITS_NS] = VK_PERFORMANCE_COUNTER_UNIT_NANOSECONDS_KHR, + [INTEL_PERF_COUNTER_UNITS_US] = VK_PERFORMANCE_COUNTER_UNIT_NANOSECONDS_KHR, /* todo */ + [INTEL_PERF_COUNTER_UNITS_PIXELS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_TEXELS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_THREADS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_PERCENT] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR, + [INTEL_PERF_COUNTER_UNITS_MESSAGES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_NUMBER] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_CYCLES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_EVENTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_UTILIZATION] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_EU_SENDS_TO_L3_CACHE_LINES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_EU_REQUESTS_TO_L3_CACHE_LINES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [INTEL_PERF_COUNTER_UNITS_EU_BYTES_PER_L3_CACHE_LINE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, +}; + +static const VkPerformanceCounterStorageKHR +intel_perf_counter_data_type_to_vk_storage[] = { + [INTEL_PERF_COUNTER_DATA_TYPE_BOOL32] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR, + [INTEL_PERF_COUNTER_DATA_TYPE_UINT32] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR, + [INTEL_PERF_COUNTER_DATA_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR, + [INTEL_PERF_COUNTER_DATA_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, + [INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR, +}; + +VkResult anv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + uint32_t* pCounterCount, + VkPerformanceCounterKHR* pCounters, + VkPerformanceCounterDescriptionKHR* pCounterDescriptions) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + struct intel_perf_config *perf = pdevice->perf; + + uint32_t desc_count = *pCounterCount; + + VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount); + 
VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc, + pCounterDescriptions, &desc_count); + + /* We cannot support performance queries on anything other than RCS, + * because the MI_REPORT_PERF_COUNT command is not available on other + * engines. + */ + struct anv_queue_family *queue_family = + &pdevice->queue.families[queueFamilyIndex]; + if (queue_family->engine_class != I915_ENGINE_CLASS_RENDER) + return vk_outarray_status(&out); + + for (int c = 0; c < (perf ? perf->n_counters : 0); c++) { + const struct intel_perf_query_counter *intel_counter = perf->counter_infos[c].counter; + + vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) { + counter->unit = intel_perf_counter_unit_to_vk_unit[intel_counter->units]; + counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; + counter->storage = intel_perf_counter_data_type_to_vk_storage[intel_counter->data_type]; + + unsigned char sha1_result[20]; + _mesa_sha1_compute(intel_counter->symbol_name, + strlen(intel_counter->symbol_name), + sha1_result); + memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); + } + + vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) { + desc->flags = 0; /* None so far. */ + snprintf(desc->name, sizeof(desc->name), "%s", intel_counter->name); + snprintf(desc->category, sizeof(desc->category), "%s", intel_counter->category); + snprintf(desc->description, sizeof(desc->description), "%s", intel_counter->desc); + } + } + + return vk_outarray_status(&out); +} + +void anv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( + VkPhysicalDevice physicalDevice, + const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo, + uint32_t* pNumPasses) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + struct intel_perf_config *perf = pdevice->perf; + + if (!perf) { + *pNumPasses = 0; + return; + } + + *pNumPasses = intel_perf_get_n_passes(perf, + pPerformanceQueryCreateInfo->pCounterIndices, + pPerformanceQueryCreateInfo->counterIndexCount, + NULL); +} + +VkResult anv_AcquireProfilingLockKHR( + VkDevice _device, + const VkAcquireProfilingLockInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct intel_perf_config *perf = device->physical->perf; + struct intel_perf_query_info *first_metric_set = &perf->queries[0]; + int fd = -1; + + assert(device->perf_fd == -1); + + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) { + fd = anv_device_perf_open(device, first_metric_set->oa_metrics_set_id); + if (fd < 0) + return VK_TIMEOUT; + } + + device->perf_fd = fd; + return VK_SUCCESS; +} + +void anv_ReleaseProfilingLockKHR( + VkDevice _device) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) { + assert(device->perf_fd >= 0); + close(device->perf_fd); + } + device->perf_fd = -1; +} + +void +anv_perf_write_pass_results(struct intel_perf_config *perf, + struct anv_query_pool *pool, uint32_t pass, + const struct intel_perf_query_result *accumulated_results, + union VkPerformanceCounterResultKHR *results) +{ + for (uint32_t c = 0; c < pool->n_counters; c++) { + const struct intel_perf_counter_pass *counter_pass = &pool->counter_pass[c]; + + if (counter_pass->pass != pass) + continue; + + switch (pool->pass_query[pass]->kind) { + case INTEL_PERF_QUERY_TYPE_PIPELINE: { + assert(counter_pass->counter->data_type == INTEL_PERF_COUNTER_DATA_TYPE_UINT64); + uint32_t accu_offset = counter_pass->counter->offset / sizeof(uint64_t); + results[c].uint64 = accumulated_results->accumulator[accu_offset]; + break; + 
} + + case INTEL_PERF_QUERY_TYPE_OA: + case INTEL_PERF_QUERY_TYPE_RAW: + switch (counter_pass->counter->data_type) { + case INTEL_PERF_COUNTER_DATA_TYPE_UINT64: + results[c].uint64 = + counter_pass->counter->oa_counter_read_uint64(perf, + counter_pass->query, + accumulated_results); + break; + case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT: + results[c].float32 = + counter_pass->counter->oa_counter_read_float(perf, + counter_pass->query, + accumulated_results); + break; + default: + /* So far we aren't using uint32, double or bool32... */ + unreachable("unexpected counter data type"); + } + break; + + default: + unreachable("invalid query type"); + } + + /* The Vulkan extension only has nanoseconds as a unit */ + if (counter_pass->counter->units == INTEL_PERF_COUNTER_UNITS_US) { + assert(counter_pass->counter->data_type == INTEL_PERF_COUNTER_DATA_TYPE_UINT64); + results[c].uint64 *= 1000; + } + } +} diff --git a/src/intel/vulkan_hasvk/anv_pipeline.c b/src/intel/vulkan_hasvk/anv_pipeline.c new file mode 100644 index 00000000000..1765b33070d --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_pipeline.c @@ -0,0 +1,3300 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "util/mesa-sha1.h" +#include "util/os_time.h" +#include "common/intel_l3_config.h" +#include "common/intel_disasm.h" +#include "common/intel_sample_positions.h" +#include "anv_private.h" +#include "compiler/brw_nir.h" +#include "compiler/brw_nir_rt.h" +#include "anv_nir.h" +#include "nir/nir_xfb_info.h" +#include "spirv/nir_spirv.h" +#include "vk_pipeline.h" +#include "vk_render_pass.h" +#include "vk_util.h" + +/* Needed for SWIZZLE macros */ +#include "program/prog_instruction.h" + +/* Eventually, this will become part of anv_CreateShader. Unfortunately, + * we can't do that yet because we don't have the ability to copy nir. 
+ */ +static nir_shader * +anv_shader_stage_to_nir(struct anv_device *device, + const VkPipelineShaderStageCreateInfo *stage_info, + void *mem_ctx) +{ + const struct anv_physical_device *pdevice = device->physical; + const struct anv_instance *instance = pdevice->instance; + const struct brw_compiler *compiler = pdevice->compiler; + gl_shader_stage stage = vk_to_mesa_shader_stage(stage_info->stage); + const nir_shader_compiler_options *nir_options = + compiler->nir_options[stage]; + + const struct spirv_to_nir_options spirv_options = { + .caps = { + .demote_to_helper_invocation = true, + .derivative_group = true, + .descriptor_array_dynamic_indexing = true, + .descriptor_array_non_uniform_indexing = true, + .descriptor_indexing = true, + .device_group = true, + .draw_parameters = true, + .float16 = pdevice->info.ver >= 8, + .float32_atomic_add = pdevice->info.has_lsc, + .float32_atomic_min_max = pdevice->info.ver >= 9, + .float64 = pdevice->info.ver >= 8, + .float64_atomic_min_max = pdevice->info.has_lsc, + .fragment_shader_sample_interlock = pdevice->info.ver >= 9, + .fragment_shader_pixel_interlock = pdevice->info.ver >= 9, + .geometry_streams = true, + /* When using Vulkan 1.3 or KHR_format_feature_flags2 is enabled, the + * read/write without format is per format, so just report true. It's + * up to the application to check. + */ + .image_read_without_format = instance->vk.app_info.api_version >= VK_API_VERSION_1_3 || device->vk.enabled_extensions.KHR_format_feature_flags2, + .image_write_without_format = true, + .int8 = pdevice->info.ver >= 8, + .int16 = pdevice->info.ver >= 8, + .int64 = pdevice->info.ver >= 8, + .int64_atomics = pdevice->info.ver >= 9 && pdevice->use_softpin, + .integer_functions2 = pdevice->info.ver >= 8, + .mesh_shading_nv = pdevice->vk.supported_extensions.NV_mesh_shader, + .min_lod = true, + .multiview = true, + .physical_storage_buffer_address = pdevice->has_a64_buffer_access, + .post_depth_coverage = pdevice->info.ver >= 9, + .runtime_descriptor_array = true, + .float_controls = pdevice->info.ver >= 8, + .ray_query = pdevice->info.has_ray_tracing, + .ray_tracing = pdevice->info.has_ray_tracing, + .shader_clock = true, + .shader_viewport_index_layer = true, + .stencil_export = pdevice->info.ver >= 9, + .storage_8bit = pdevice->info.ver >= 8, + .storage_16bit = pdevice->info.ver >= 8, + .subgroup_arithmetic = true, + .subgroup_basic = true, + .subgroup_ballot = true, + .subgroup_dispatch = true, + .subgroup_quad = true, + .subgroup_uniform_control_flow = true, + .subgroup_shuffle = true, + .subgroup_vote = true, + .tessellation = true, + .transform_feedback = pdevice->info.ver >= 8, + .variable_pointers = true, + .vk_memory_model = true, + .vk_memory_model_device_scope = true, + .workgroup_memory_explicit_layout = true, + .fragment_shading_rate = pdevice->info.ver >= 11, + }, + .ubo_addr_format = + anv_nir_ubo_addr_format(pdevice, device->robust_buffer_access), + .ssbo_addr_format = + anv_nir_ssbo_addr_format(pdevice, device->robust_buffer_access), + .phys_ssbo_addr_format = nir_address_format_64bit_global, + .push_const_addr_format = nir_address_format_logical, + + /* TODO: Consider changing this to an address format that has the NULL + * pointer equals to 0. That might be a better format to play nice + * with certain code / code generators. 
+ */ + .shared_addr_format = nir_address_format_32bit_offset, + }; + + nir_shader *nir; + VkResult result = + vk_pipeline_shader_stage_to_nir(&device->vk, stage_info, + &spirv_options, nir_options, + mem_ctx, &nir); + if (result != VK_SUCCESS) + return NULL; + + if (INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage))) { + fprintf(stderr, "NIR (from SPIR-V) for %s shader:\n", + gl_shader_stage_name(stage)); + nir_print_shader(nir, stderr); + } + + NIR_PASS_V(nir, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(nir), true, false); + + const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = { + .point_coord = true, + }; + NIR_PASS(_, nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings); + + const nir_opt_access_options opt_access_options = { + .is_vulkan = true, + .infer_non_readable = true, + }; + NIR_PASS(_, nir, nir_opt_access, &opt_access_options); + + NIR_PASS(_, nir, nir_lower_frexp); + + /* Vulkan uses the separate-shader linking model */ + nir->info.separate_shader = true; + + brw_preprocess_nir(compiler, nir, NULL); + + return nir; +} + +VkResult +anv_pipeline_init(struct anv_pipeline *pipeline, + struct anv_device *device, + enum anv_pipeline_type type, + VkPipelineCreateFlags flags, + const VkAllocationCallbacks *pAllocator) +{ + VkResult result; + + memset(pipeline, 0, sizeof(*pipeline)); + + vk_object_base_init(&device->vk, &pipeline->base, + VK_OBJECT_TYPE_PIPELINE); + pipeline->device = device; + + /* It's the job of the child class to provide actual backing storage for + * the batch by setting batch.start, batch.next, and batch.end. + */ + pipeline->batch.alloc = pAllocator ? pAllocator : &device->vk.alloc; + pipeline->batch.relocs = &pipeline->batch_relocs; + pipeline->batch.status = VK_SUCCESS; + + result = anv_reloc_list_init(&pipeline->batch_relocs, + pipeline->batch.alloc); + if (result != VK_SUCCESS) + return result; + + pipeline->mem_ctx = ralloc_context(NULL); + + pipeline->type = type; + pipeline->flags = flags; + + util_dynarray_init(&pipeline->executables, pipeline->mem_ctx); + + return VK_SUCCESS; +} + +void +anv_pipeline_finish(struct anv_pipeline *pipeline, + struct anv_device *device, + const VkAllocationCallbacks *pAllocator) +{ + anv_reloc_list_finish(&pipeline->batch_relocs, + pAllocator ? 
pAllocator : &device->vk.alloc); + ralloc_free(pipeline->mem_ctx); + vk_object_base_finish(&pipeline->base); +} + +void anv_DestroyPipeline( + VkDevice _device, + VkPipeline _pipeline, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline); + + if (!pipeline) + return; + + switch (pipeline->type) { + case ANV_PIPELINE_GRAPHICS: { + struct anv_graphics_pipeline *gfx_pipeline = + anv_pipeline_to_graphics(pipeline); + + for (unsigned s = 0; s < ARRAY_SIZE(gfx_pipeline->shaders); s++) { + if (gfx_pipeline->shaders[s]) + anv_shader_bin_unref(device, gfx_pipeline->shaders[s]); + } + break; + } + + case ANV_PIPELINE_COMPUTE: { + struct anv_compute_pipeline *compute_pipeline = + anv_pipeline_to_compute(pipeline); + + if (compute_pipeline->cs) + anv_shader_bin_unref(device, compute_pipeline->cs); + + break; + } + + case ANV_PIPELINE_RAY_TRACING: { + struct anv_ray_tracing_pipeline *rt_pipeline = + anv_pipeline_to_ray_tracing(pipeline); + + util_dynarray_foreach(&rt_pipeline->shaders, + struct anv_shader_bin *, shader) { + anv_shader_bin_unref(device, *shader); + } + break; + } + + default: + unreachable("invalid pipeline type"); + } + + anv_pipeline_finish(pipeline, device, pAllocator); + vk_free2(&device->vk.alloc, pAllocator, pipeline); +} + +static void +populate_sampler_prog_key(const struct intel_device_info *devinfo, + struct brw_sampler_prog_key_data *key) +{ + /* Almost all multisampled textures are compressed. The only time when we + * don't compress a multisampled texture is for 16x MSAA with a surface + * width greater than 8k which is a bit of an edge case. Since the sampler + * just ignores the MCS parameter to ld2ms when MCS is disabled, it's safe + * to tell the compiler to always assume compression. + */ + key->compressed_multisample_layout_mask = ~0; + + /* SkyLake added support for 16x MSAA. With this came a new message for + * reading from a 16x MSAA surface with compression. The new message was + * needed because now the MCS data is 64 bits instead of 32 or lower as is + * the case for 8x, 4x, and 2x. The key->msaa_16 bit-field controls which + * message we use. Fortunately, the 16x message works for 8x, 4x, and 2x + * so we can just use it unconditionally. This may not be quite as + * efficient but it saves us from recompiling. + */ + if (devinfo->ver >= 9) + key->msaa_16 = ~0; + + /* XXX: Handle texture swizzle on HSW- */ + for (int i = 0; i < BRW_MAX_SAMPLERS; i++) { + /* Assume color sampler, no swizzling. 
(Works for BDW+) */ + key->swizzles[i] = SWIZZLE_XYZW; + } +} + +static void +populate_base_prog_key(const struct anv_device *device, + bool robust_buffer_acccess, + struct brw_base_prog_key *key) +{ + key->robust_buffer_access = robust_buffer_acccess; + key->limit_trig_input_range = + device->physical->instance->limit_trig_input_range; + + populate_sampler_prog_key(device->info, &key->tex); +} + +static void +populate_vs_prog_key(const struct anv_device *device, + bool robust_buffer_acccess, + struct brw_vs_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_acccess, &key->base); + + /* XXX: Handle vertex input work-arounds */ + + /* XXX: Handle sampler_prog_key */ +} + +static void +populate_tcs_prog_key(const struct anv_device *device, + bool robust_buffer_acccess, + unsigned input_vertices, + struct brw_tcs_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_acccess, &key->base); + + key->input_vertices = input_vertices; +} + +static void +populate_tes_prog_key(const struct anv_device *device, + bool robust_buffer_acccess, + struct brw_tes_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_acccess, &key->base); +} + +static void +populate_gs_prog_key(const struct anv_device *device, + bool robust_buffer_acccess, + struct brw_gs_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_acccess, &key->base); +} + +static bool +pipeline_has_coarse_pixel(const struct anv_graphics_pipeline *pipeline, + const BITSET_WORD *dynamic, + const struct vk_multisample_state *ms, + const struct vk_fragment_shading_rate_state *fsr) +{ + /* The Vulkan 1.2.199 spec says: + * + * "If any of the following conditions are met, Cxy' must be set to + * {1,1}: + * + * * If Sample Shading is enabled. + * * [...]" + * + * And "sample shading" is defined as follows: + * + * "Sample shading is enabled for a graphics pipeline: + * + * * If the interface of the fragment shader entry point of the + * graphics pipeline includes an input variable decorated with + * SampleId or SamplePosition. In this case minSampleShadingFactor + * takes the value 1.0. + * + * * Else if the sampleShadingEnable member of the + * VkPipelineMultisampleStateCreateInfo structure specified when + * creating the graphics pipeline is set to VK_TRUE. In this case + * minSampleShadingFactor takes the value of + * VkPipelineMultisampleStateCreateInfo::minSampleShading. + * + * Otherwise, sample shading is considered disabled." + * + * The first bullet above is handled by the back-end compiler because those + * inputs both force per-sample dispatch. The second bullet is handled + * here. Note that this sample shading being enabled has nothing to do + * with minSampleShading. + */ + if (ms != NULL && ms->sample_shading_enable) + return false; + + /* Not dynamic & pipeline has a 1x1 fragment shading rate with no + * possibility for element of the pipeline to change the value. 
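+    * Both combiner ops must be KEEP: KEEP preserves the pipeline's 1x1
+    * rate, whereas any other combiner op could let the primitive or
+    * attachment shading rate enlarge the fragment size.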
+ */ + if (!BITSET_TEST(dynamic, MESA_VK_DYNAMIC_FSR) && + fsr->fragment_size.width <= 1 && + fsr->fragment_size.height <= 1 && + fsr->combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR && + fsr->combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) + return false; + + return true; +} + +static void +populate_task_prog_key(const struct anv_device *device, + bool robust_buffer_access, + struct brw_task_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_access, &key->base); +} + +static void +populate_mesh_prog_key(const struct anv_device *device, + bool robust_buffer_access, + struct brw_mesh_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_access, &key->base); +} + +static void +populate_wm_prog_key(const struct anv_graphics_pipeline *pipeline, + bool robust_buffer_acccess, + const BITSET_WORD *dynamic, + const struct vk_multisample_state *ms, + const struct vk_fragment_shading_rate_state *fsr, + const struct vk_render_pass_state *rp, + struct brw_wm_prog_key *key) +{ + const struct anv_device *device = pipeline->base.device; + + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_acccess, &key->base); + + /* We set this to 0 here and set to the actual value before we call + * brw_compile_fs. + */ + key->input_slots_valid = 0; + + /* XXX Vulkan doesn't appear to specify */ + key->clamp_fragment_color = false; + + key->ignore_sample_mask_out = false; + + assert(rp->color_attachment_count <= MAX_RTS); + /* Consider all inputs as valid until look at the NIR variables. */ + key->color_outputs_valid = (1u << rp->color_attachment_count) - 1; + key->nr_color_regions = rp->color_attachment_count; + + /* To reduce possible shader recompilations we would need to know if + * there is a SampleMask output variable to compute if we should emit + * code to workaround the issue that hardware disables alpha to coverage + * when there is SampleMask output. + */ + key->alpha_to_coverage = ms != NULL && ms->alpha_to_coverage_enable; + + /* Vulkan doesn't support fixed-function alpha test */ + key->alpha_test_replicate_alpha = false; + + if (ms != NULL) { + /* We should probably pull this out of the shader, but it's fairly + * harmless to compute it and then let dead-code take care of it. 
+ */ + if (ms->rasterization_samples > 1) { + key->persample_interp = ms->sample_shading_enable && + (ms->min_sample_shading * ms->rasterization_samples) > 1; + key->multisample_fbo = true; + } + + if (device->physical->instance->sample_mask_out_opengl_behaviour) + key->ignore_sample_mask_out = !key->multisample_fbo; + } + + key->coarse_pixel = + !key->persample_interp && + device->vk.enabled_extensions.KHR_fragment_shading_rate && + pipeline_has_coarse_pixel(pipeline, dynamic, ms, fsr); +} + +static void +populate_cs_prog_key(const struct anv_device *device, + bool robust_buffer_acccess, + struct brw_cs_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_acccess, &key->base); +} + +static void +populate_bs_prog_key(const struct anv_device *device, + bool robust_buffer_access, + struct brw_bs_prog_key *key) +{ + memset(key, 0, sizeof(*key)); + + populate_base_prog_key(device, robust_buffer_access, &key->base); +} + +struct anv_pipeline_stage { + gl_shader_stage stage; + + const VkPipelineShaderStageCreateInfo *info; + + unsigned char shader_sha1[20]; + + union brw_any_prog_key key; + + struct { + gl_shader_stage stage; + unsigned char sha1[20]; + } cache_key; + + nir_shader *nir; + + struct anv_pipeline_binding surface_to_descriptor[256]; + struct anv_pipeline_binding sampler_to_descriptor[256]; + struct anv_pipeline_bind_map bind_map; + + union brw_any_prog_data prog_data; + + uint32_t num_stats; + struct brw_compile_stats stats[3]; + char *disasm[3]; + + VkPipelineCreationFeedback feedback; + + const unsigned *code; + + struct anv_shader_bin *bin; +}; + +static void +anv_pipeline_hash_graphics(struct anv_graphics_pipeline *pipeline, + struct anv_pipeline_layout *layout, + struct anv_pipeline_stage *stages, + unsigned char *sha1_out) +{ + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + + _mesa_sha1_update(&ctx, &pipeline->view_mask, + sizeof(pipeline->view_mask)); + + if (layout) + _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); + + const bool rba = pipeline->base.device->robust_buffer_access; + _mesa_sha1_update(&ctx, &rba, sizeof(rba)); + + for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) { + if (stages[s].info) { + _mesa_sha1_update(&ctx, stages[s].shader_sha1, + sizeof(stages[s].shader_sha1)); + _mesa_sha1_update(&ctx, &stages[s].key, brw_prog_key_size(s)); + } + } + + _mesa_sha1_final(&ctx, sha1_out); +} + +static void +anv_pipeline_hash_compute(struct anv_compute_pipeline *pipeline, + struct anv_pipeline_layout *layout, + struct anv_pipeline_stage *stage, + unsigned char *sha1_out) +{ + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + + if (layout) + _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); + + const struct anv_device *device = pipeline->base.device; + + const bool rba = device->robust_buffer_access; + _mesa_sha1_update(&ctx, &rba, sizeof(rba)); + + const bool afs = device->physical->instance->assume_full_subgroups; + _mesa_sha1_update(&ctx, &afs, sizeof(afs)); + + _mesa_sha1_update(&ctx, stage->shader_sha1, + sizeof(stage->shader_sha1)); + _mesa_sha1_update(&ctx, &stage->key.cs, sizeof(stage->key.cs)); + + _mesa_sha1_final(&ctx, sha1_out); +} + +static void +anv_pipeline_hash_ray_tracing_shader(struct anv_ray_tracing_pipeline *pipeline, + struct anv_pipeline_layout *layout, + struct anv_pipeline_stage *stage, + unsigned char *sha1_out) +{ + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + + if (layout != NULL) + _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); + + const bool 
rba = pipeline->base.device->robust_buffer_access; + _mesa_sha1_update(&ctx, &rba, sizeof(rba)); + + _mesa_sha1_update(&ctx, stage->shader_sha1, sizeof(stage->shader_sha1)); + _mesa_sha1_update(&ctx, &stage->key, sizeof(stage->key.bs)); + + _mesa_sha1_final(&ctx, sha1_out); +} + +static void +anv_pipeline_hash_ray_tracing_combined_shader(struct anv_ray_tracing_pipeline *pipeline, + struct anv_pipeline_layout *layout, + struct anv_pipeline_stage *intersection, + struct anv_pipeline_stage *any_hit, + unsigned char *sha1_out) +{ + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + + if (layout != NULL) + _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); + + const bool rba = pipeline->base.device->robust_buffer_access; + _mesa_sha1_update(&ctx, &rba, sizeof(rba)); + + _mesa_sha1_update(&ctx, intersection->shader_sha1, sizeof(intersection->shader_sha1)); + _mesa_sha1_update(&ctx, &intersection->key, sizeof(intersection->key.bs)); + _mesa_sha1_update(&ctx, any_hit->shader_sha1, sizeof(any_hit->shader_sha1)); + _mesa_sha1_update(&ctx, &any_hit->key, sizeof(any_hit->key.bs)); + + _mesa_sha1_final(&ctx, sha1_out); +} + +static nir_shader * +anv_pipeline_stage_get_nir(struct anv_pipeline *pipeline, + struct vk_pipeline_cache *cache, + void *mem_ctx, + struct anv_pipeline_stage *stage) +{ + const struct brw_compiler *compiler = + pipeline->device->physical->compiler; + const nir_shader_compiler_options *nir_options = + compiler->nir_options[stage->stage]; + nir_shader *nir; + + nir = anv_device_search_for_nir(pipeline->device, cache, + nir_options, + stage->shader_sha1, + mem_ctx); + if (nir) { + assert(nir->info.stage == stage->stage); + return nir; + } + + nir = anv_shader_stage_to_nir(pipeline->device, stage->info, mem_ctx); + if (nir) { + anv_device_upload_nir(pipeline->device, cache, nir, stage->shader_sha1); + return nir; + } + + return NULL; +} + +static void +shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) +{ + assert(glsl_type_is_vector_or_scalar(type)); + + uint32_t comp_size = glsl_type_is_boolean(type) + ? 4 : glsl_get_bit_size(type) / 8; + unsigned length = glsl_get_vector_elements(type); + *size = comp_size * length, + *align = comp_size * (length == 3 ? 
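/* a vec3 slot is aligned like a vec4 */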
4 : length); +} + +static void +anv_pipeline_lower_nir(struct anv_pipeline *pipeline, + void *mem_ctx, + struct anv_pipeline_stage *stage, + struct anv_pipeline_layout *layout, + bool use_primitive_replication) +{ + const struct anv_physical_device *pdevice = pipeline->device->physical; + const struct brw_compiler *compiler = pdevice->compiler; + + struct brw_stage_prog_data *prog_data = &stage->prog_data.base; + nir_shader *nir = stage->nir; + + if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, nir_lower_wpos_center); + NIR_PASS(_, nir, nir_lower_input_attachments, + &(nir_input_attachment_options) { + .use_fragcoord_sysval = true, + .use_layer_id_sysval = true, + }); + } + + NIR_PASS(_, nir, anv_nir_lower_ycbcr_textures, layout); + + if (pipeline->type == ANV_PIPELINE_GRAPHICS) { + struct anv_graphics_pipeline *gfx_pipeline = + anv_pipeline_to_graphics(pipeline); + NIR_PASS(_, nir, anv_nir_lower_multiview, gfx_pipeline->view_mask, + use_primitive_replication); + } + + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + NIR_PASS(_, nir, brw_nir_lower_storage_image, compiler->devinfo); + + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global, + nir_address_format_64bit_global); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const, + nir_address_format_32bit_offset); + + NIR_PASS(_, nir, brw_nir_lower_ray_queries, &pdevice->info); + + /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ + NIR_PASS_V(nir, anv_nir_apply_pipeline_layout, + pdevice, pipeline->device->robust_buffer_access, + layout, &stage->bind_map); + + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo, + anv_nir_ubo_addr_format(pdevice, + pipeline->device->robust_buffer_access)); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo, + anv_nir_ssbo_addr_format(pdevice, + pipeline->device->robust_buffer_access)); + + /* First run copy-prop to get rid of all of the vec() that address + * calculations often create and then constant-fold so that, when we + * get to anv_nir_lower_ubo_loads, we can detect constant offsets. + */ + NIR_PASS(_, nir, nir_copy_prop); + NIR_PASS(_, nir, nir_opt_constant_folding); + + NIR_PASS(_, nir, anv_nir_lower_ubo_loads); + + enum nir_lower_non_uniform_access_type lower_non_uniform_access_types = + nir_lower_non_uniform_texture_access | nir_lower_non_uniform_image_access; + + /* In practice, most shaders do not have non-uniform-qualified + * accesses (see + * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17558#note_1475069) + * thus a cheaper and likely to fail check is run first. + */ + if (nir_has_non_uniform_access(nir, lower_non_uniform_access_types)) { + NIR_PASS(_, nir, nir_opt_non_uniform_access); + + /* We don't support non-uniform UBOs and non-uniform SSBO access is + * handled naturally by falling back to A64 messages. 
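+ * Only the texture and image access types selected above are lowered at
+ * this point.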
+ */ + NIR_PASS(_, nir, nir_lower_non_uniform_access, + &(nir_lower_non_uniform_access_options) { + .types = lower_non_uniform_access_types, + .callback = NULL, + }); + } + + NIR_PASS_V(nir, anv_nir_compute_push_layout, + pdevice, pipeline->device->robust_buffer_access, + prog_data, &stage->bind_map, mem_ctx); + + if (gl_shader_stage_uses_workgroup(nir->info.stage)) { + if (!nir->info.shared_memory_explicit_layout) { + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, + nir_var_mem_shared, shared_type_info); + } + + NIR_PASS(_, nir, nir_lower_explicit_io, + nir_var_mem_shared, nir_address_format_32bit_offset); + + if (nir->info.zero_initialize_shared_memory && + nir->info.shared_size > 0) { + /* The effective Shared Local Memory size is at least 1024 bytes and + * is always rounded to a power of two, so it is OK to align the size + * used by the shader to chunk_size -- which does simplify the logic. + */ + const unsigned chunk_size = 16; + const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size); + assert(shared_size <= + intel_calculate_slm_size(compiler->devinfo->ver, nir->info.shared_size)); + + NIR_PASS(_, nir, nir_zero_initialize_shared_memory, + shared_size, chunk_size); + } + } + + if (gl_shader_stage_is_compute(nir->info.stage) || + gl_shader_stage_is_mesh(nir->info.stage)) + NIR_PASS(_, nir, brw_nir_lower_cs_intrinsics); + + stage->nir = nir; +} + +static void +anv_pipeline_link_vs(const struct brw_compiler *compiler, + struct anv_pipeline_stage *vs_stage, + struct anv_pipeline_stage *next_stage) +{ + if (next_stage) + brw_nir_link_shaders(compiler, vs_stage->nir, next_stage->nir); +} + +static void +anv_pipeline_compile_vs(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_graphics_pipeline *pipeline, + struct anv_pipeline_stage *vs_stage) +{ + /* When using Primitive Replication for multiview, each view gets its own + * position slot. + */ + uint32_t pos_slots = + (vs_stage->nir->info.per_view_outputs & VARYING_BIT_POS) ? + MAX2(1, util_bitcount(pipeline->view_mask)) : 1; + + /* Only position is allowed to be per-view */ + assert(!(vs_stage->nir->info.per_view_outputs & ~VARYING_BIT_POS)); + + brw_compute_vue_map(compiler->devinfo, + &vs_stage->prog_data.vs.base.vue_map, + vs_stage->nir->info.outputs_written, + vs_stage->nir->info.separate_shader, + pos_slots); + + vs_stage->num_stats = 1; + + struct brw_compile_vs_params params = { + .nir = vs_stage->nir, + .key = &vs_stage->key.vs, + .prog_data = &vs_stage->prog_data.vs, + .stats = vs_stage->stats, + .log_data = pipeline->base.device, + }; + + vs_stage->code = brw_compile_vs(compiler, mem_ctx, ¶ms); +} + +static void +merge_tess_info(struct shader_info *tes_info, + const struct shader_info *tcs_info) +{ + /* The Vulkan 1.0.38 spec, section 21.1 Tessellator says: + * + * "PointMode. Controls generation of points rather than triangles + * or lines. This functionality defaults to disabled, and is + * enabled if either shader stage includes the execution mode. + * + * and about Triangles, Quads, IsoLines, VertexOrderCw, VertexOrderCcw, + * PointMode, SpacingEqual, SpacingFractionalEven, SpacingFractionalOdd, + * and OutputVertices, it says: + * + * "One mode must be set in at least one of the tessellation + * shader stages." + * + * So, the fields can be set in either the TCS or TES, but they must + * agree if set in both. Our backend looks at TES, so bitwise-or in + * the values from the TCS. 
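+ * For example, if only the TES declares SpacingEqual the OR below keeps
+ * it, while the asserts catch the case where both stages declare
+ * conflicting values.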
+ */ + assert(tcs_info->tess.tcs_vertices_out == 0 || + tes_info->tess.tcs_vertices_out == 0 || + tcs_info->tess.tcs_vertices_out == tes_info->tess.tcs_vertices_out); + tes_info->tess.tcs_vertices_out |= tcs_info->tess.tcs_vertices_out; + + assert(tcs_info->tess.spacing == TESS_SPACING_UNSPECIFIED || + tes_info->tess.spacing == TESS_SPACING_UNSPECIFIED || + tcs_info->tess.spacing == tes_info->tess.spacing); + tes_info->tess.spacing |= tcs_info->tess.spacing; + + assert(tcs_info->tess._primitive_mode == 0 || + tes_info->tess._primitive_mode == 0 || + tcs_info->tess._primitive_mode == tes_info->tess._primitive_mode); + tes_info->tess._primitive_mode |= tcs_info->tess._primitive_mode; + tes_info->tess.ccw |= tcs_info->tess.ccw; + tes_info->tess.point_mode |= tcs_info->tess.point_mode; +} + +static void +anv_pipeline_link_tcs(const struct brw_compiler *compiler, + struct anv_pipeline_stage *tcs_stage, + struct anv_pipeline_stage *tes_stage) +{ + assert(tes_stage && tes_stage->stage == MESA_SHADER_TESS_EVAL); + + brw_nir_link_shaders(compiler, tcs_stage->nir, tes_stage->nir); + + nir_lower_patch_vertices(tes_stage->nir, + tcs_stage->nir->info.tess.tcs_vertices_out, + NULL); + + /* Copy TCS info into the TES info */ + merge_tess_info(&tes_stage->nir->info, &tcs_stage->nir->info); + + /* Whacking the key after cache lookup is a bit sketchy, but all of + * this comes from the SPIR-V, which is part of the hash used for the + * pipeline cache. So it should be safe. + */ + tcs_stage->key.tcs._tes_primitive_mode = + tes_stage->nir->info.tess._primitive_mode; + tcs_stage->key.tcs.quads_workaround = + compiler->devinfo->ver < 9 && + tes_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_QUADS && + tes_stage->nir->info.tess.spacing == TESS_SPACING_EQUAL; +} + +static void +anv_pipeline_compile_tcs(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_device *device, + struct anv_pipeline_stage *tcs_stage, + struct anv_pipeline_stage *prev_stage) +{ + tcs_stage->key.tcs.outputs_written = + tcs_stage->nir->info.outputs_written; + tcs_stage->key.tcs.patch_outputs_written = + tcs_stage->nir->info.patch_outputs_written; + + tcs_stage->num_stats = 1; + + struct brw_compile_tcs_params params = { + .nir = tcs_stage->nir, + .key = &tcs_stage->key.tcs, + .prog_data = &tcs_stage->prog_data.tcs, + .stats = tcs_stage->stats, + .log_data = device, + }; + + tcs_stage->code = brw_compile_tcs(compiler, mem_ctx, ¶ms); +} + +static void +anv_pipeline_link_tes(const struct brw_compiler *compiler, + struct anv_pipeline_stage *tes_stage, + struct anv_pipeline_stage *next_stage) +{ + if (next_stage) + brw_nir_link_shaders(compiler, tes_stage->nir, next_stage->nir); +} + +static void +anv_pipeline_compile_tes(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_device *device, + struct anv_pipeline_stage *tes_stage, + struct anv_pipeline_stage *tcs_stage) +{ + tes_stage->key.tes.inputs_read = + tcs_stage->nir->info.outputs_written; + tes_stage->key.tes.patch_inputs_read = + tcs_stage->nir->info.patch_outputs_written; + + tes_stage->num_stats = 1; + + struct brw_compile_tes_params params = { + .nir = tes_stage->nir, + .key = &tes_stage->key.tes, + .prog_data = &tes_stage->prog_data.tes, + .input_vue_map = &tcs_stage->prog_data.tcs.base.vue_map, + .stats = tes_stage->stats, + .log_data = device, + }; + + tes_stage->code = brw_compile_tes(compiler, mem_ctx, ¶ms); +} + +static void +anv_pipeline_link_gs(const struct brw_compiler *compiler, + struct anv_pipeline_stage *gs_stage, + struct 
anv_pipeline_stage *next_stage) +{ + if (next_stage) + brw_nir_link_shaders(compiler, gs_stage->nir, next_stage->nir); +} + +static void +anv_pipeline_compile_gs(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_device *device, + struct anv_pipeline_stage *gs_stage, + struct anv_pipeline_stage *prev_stage) +{ + brw_compute_vue_map(compiler->devinfo, + &gs_stage->prog_data.gs.base.vue_map, + gs_stage->nir->info.outputs_written, + gs_stage->nir->info.separate_shader, 1); + + gs_stage->num_stats = 1; + + struct brw_compile_gs_params params = { + .nir = gs_stage->nir, + .key = &gs_stage->key.gs, + .prog_data = &gs_stage->prog_data.gs, + .stats = gs_stage->stats, + .log_data = device, + }; + + gs_stage->code = brw_compile_gs(compiler, mem_ctx, ¶ms); +} + +static void +anv_pipeline_link_task(const struct brw_compiler *compiler, + struct anv_pipeline_stage *task_stage, + struct anv_pipeline_stage *next_stage) +{ + assert(next_stage); + assert(next_stage->stage == MESA_SHADER_MESH); + brw_nir_link_shaders(compiler, task_stage->nir, next_stage->nir); +} + +static void +anv_pipeline_compile_task(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_device *device, + struct anv_pipeline_stage *task_stage) +{ + task_stage->num_stats = 1; + + struct brw_compile_task_params params = { + .nir = task_stage->nir, + .key = &task_stage->key.task, + .prog_data = &task_stage->prog_data.task, + .stats = task_stage->stats, + .log_data = device, + }; + + task_stage->code = brw_compile_task(compiler, mem_ctx, ¶ms); +} + +static void +anv_pipeline_link_mesh(const struct brw_compiler *compiler, + struct anv_pipeline_stage *mesh_stage, + struct anv_pipeline_stage *next_stage) +{ + if (next_stage) { + brw_nir_link_shaders(compiler, mesh_stage->nir, next_stage->nir); + } +} + +static void +anv_pipeline_compile_mesh(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_device *device, + struct anv_pipeline_stage *mesh_stage, + struct anv_pipeline_stage *prev_stage) +{ + mesh_stage->num_stats = 1; + + struct brw_compile_mesh_params params = { + .nir = mesh_stage->nir, + .key = &mesh_stage->key.mesh, + .prog_data = &mesh_stage->prog_data.mesh, + .stats = mesh_stage->stats, + .log_data = device, + }; + + if (prev_stage) { + assert(prev_stage->stage == MESA_SHADER_TASK); + params.tue_map = &prev_stage->prog_data.task.map; + } + + mesh_stage->code = brw_compile_mesh(compiler, mem_ctx, ¶ms); +} + +static void +anv_pipeline_link_fs(const struct brw_compiler *compiler, + struct anv_pipeline_stage *stage, + const struct vk_render_pass_state *rp) +{ + /* Initially the valid outputs value is set to all possible render targets + * valid (see populate_wm_prog_key()), before we look at the shader + * variables. Here we look at the output variables of the shader an compute + * a correct number of render target outputs. + */ + stage->key.wm.color_outputs_valid = 0; + nir_foreach_shader_out_variable_safe(var, stage->nir) { + if (var->data.location < FRAG_RESULT_DATA0) + continue; + + const unsigned rt = var->data.location - FRAG_RESULT_DATA0; + const unsigned array_len = + glsl_type_is_array(var->type) ? 
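/* an arrayed output claims one render target per array element */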
glsl_get_length(var->type) : 1; + assert(rt + array_len <= MAX_RTS); + + stage->key.wm.color_outputs_valid |= BITFIELD_RANGE(rt, array_len); + } + stage->key.wm.color_outputs_valid &= + (1u << rp->color_attachment_count) - 1; + stage->key.wm.nr_color_regions = + util_last_bit(stage->key.wm.color_outputs_valid); + + unsigned num_rt_bindings; + struct anv_pipeline_binding rt_bindings[MAX_RTS]; + if (stage->key.wm.nr_color_regions > 0) { + assert(stage->key.wm.nr_color_regions <= MAX_RTS); + for (unsigned rt = 0; rt < stage->key.wm.nr_color_regions; rt++) { + if (stage->key.wm.color_outputs_valid & BITFIELD_BIT(rt)) { + rt_bindings[rt] = (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, + .index = rt, + }; + } else { + /* Setup a null render target */ + rt_bindings[rt] = (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, + .index = UINT32_MAX, + }; + } + } + num_rt_bindings = stage->key.wm.nr_color_regions; + } else { + /* Setup a null render target */ + rt_bindings[0] = (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS, + .index = UINT32_MAX, + }; + num_rt_bindings = 1; + } + + assert(num_rt_bindings <= MAX_RTS); + assert(stage->bind_map.surface_count == 0); + typed_memcpy(stage->bind_map.surface_to_descriptor, + rt_bindings, num_rt_bindings); + stage->bind_map.surface_count += num_rt_bindings; +} + +static void +anv_pipeline_compile_fs(const struct brw_compiler *compiler, + void *mem_ctx, + struct anv_device *device, + struct anv_pipeline_stage *fs_stage, + struct anv_pipeline_stage *prev_stage) +{ + /* TODO: we could set this to 0 based on the information in nir_shader, but + * we need this before we call spirv_to_nir. + */ + assert(prev_stage); + + struct brw_compile_fs_params params = { + .nir = fs_stage->nir, + .key = &fs_stage->key.wm, + .prog_data = &fs_stage->prog_data.wm, + + .allow_spilling = true, + .stats = fs_stage->stats, + .log_data = device, + }; + + if (prev_stage->stage == MESA_SHADER_MESH) { + params.mue_map = &prev_stage->prog_data.mesh.map; + /* TODO(mesh): Slots valid, do we even use/rely on it? 
*/ + } else { + fs_stage->key.wm.input_slots_valid = + prev_stage->prog_data.vue.vue_map.slots_valid; + } + + fs_stage->code = brw_compile_fs(compiler, mem_ctx, ¶ms); + + fs_stage->num_stats = (uint32_t)fs_stage->prog_data.wm.dispatch_8 + + (uint32_t)fs_stage->prog_data.wm.dispatch_16 + + (uint32_t)fs_stage->prog_data.wm.dispatch_32; +} + +static void +anv_pipeline_add_executable(struct anv_pipeline *pipeline, + struct anv_pipeline_stage *stage, + struct brw_compile_stats *stats, + uint32_t code_offset) +{ + char *nir = NULL; + if (stage->nir && + (pipeline->flags & + VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) { + nir = nir_shader_as_str(stage->nir, pipeline->mem_ctx); + } + + char *disasm = NULL; + if (stage->code && + (pipeline->flags & + VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) { + char *stream_data = NULL; + size_t stream_size = 0; + FILE *stream = open_memstream(&stream_data, &stream_size); + + uint32_t push_size = 0; + for (unsigned i = 0; i < 4; i++) + push_size += stage->bind_map.push_ranges[i].length; + if (push_size > 0) { + fprintf(stream, "Push constant ranges:\n"); + for (unsigned i = 0; i < 4; i++) { + if (stage->bind_map.push_ranges[i].length == 0) + continue; + + fprintf(stream, " RANGE%d (%dB): ", i, + stage->bind_map.push_ranges[i].length * 32); + + switch (stage->bind_map.push_ranges[i].set) { + case ANV_DESCRIPTOR_SET_NULL: + fprintf(stream, "NULL"); + break; + + case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: + fprintf(stream, "Vulkan push constants and API params"); + break; + + case ANV_DESCRIPTOR_SET_DESCRIPTORS: + fprintf(stream, "Descriptor buffer for set %d (start=%dB)", + stage->bind_map.push_ranges[i].index, + stage->bind_map.push_ranges[i].start * 32); + break; + + case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: + unreachable("gl_NumWorkgroups is never pushed"); + + case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: + fprintf(stream, "Inline shader constant data (start=%dB)", + stage->bind_map.push_ranges[i].start * 32); + break; + + case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS: + unreachable("Color attachments can't be pushed"); + + default: + fprintf(stream, "UBO (set=%d binding=%d start=%dB)", + stage->bind_map.push_ranges[i].set, + stage->bind_map.push_ranges[i].index, + stage->bind_map.push_ranges[i].start * 32); + break; + } + fprintf(stream, "\n"); + } + fprintf(stream, "\n"); + } + + /* Creating this is far cheaper than it looks. It's perfectly fine to + * do it for every binary. + */ + intel_disassemble(&pipeline->device->physical->compiler->isa, + stage->code, code_offset, stream); + + fclose(stream); + + /* Copy it to a ralloc'd thing */ + disasm = ralloc_size(pipeline->mem_ctx, stream_size + 1); + memcpy(disasm, stream_data, stream_size); + disasm[stream_size] = 0; + + free(stream_data); + } + + const struct anv_pipeline_executable exe = { + .stage = stage->stage, + .stats = *stats, + .nir = nir, + .disasm = disasm, + }; + util_dynarray_append(&pipeline->executables, + struct anv_pipeline_executable, exe); +} + +static void +anv_pipeline_add_executables(struct anv_pipeline *pipeline, + struct anv_pipeline_stage *stage, + struct anv_shader_bin *bin) +{ + if (stage->stage == MESA_SHADER_FRAGMENT) { + /* We pull the prog data and stats out of the anv_shader_bin because + * the anv_pipeline_stage may not be fully populated if we successfully + * looked up the shader in a cache. 
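+ * A fragment shader can produce up to three binaries (SIMD8, SIMD16 and
+ * SIMD32), so the dispatch_* flags below pick which stats entries and
+ * code offsets get reported as executables.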
+ */ + const struct brw_wm_prog_data *wm_prog_data = + (const struct brw_wm_prog_data *)bin->prog_data; + struct brw_compile_stats *stats = bin->stats; + + if (wm_prog_data->dispatch_8) { + anv_pipeline_add_executable(pipeline, stage, stats++, 0); + } + + if (wm_prog_data->dispatch_16) { + anv_pipeline_add_executable(pipeline, stage, stats++, + wm_prog_data->prog_offset_16); + } + + if (wm_prog_data->dispatch_32) { + anv_pipeline_add_executable(pipeline, stage, stats++, + wm_prog_data->prog_offset_32); + } + } else { + anv_pipeline_add_executable(pipeline, stage, bin->stats, 0); + } + + pipeline->ray_queries = MAX2(pipeline->ray_queries, bin->prog_data->ray_queries); +} + +static void +anv_graphics_pipeline_init_keys(struct anv_graphics_pipeline *pipeline, + const struct vk_graphics_pipeline_state *state, + struct anv_pipeline_stage *stages) +{ + for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) { + if (!stages[s].info) + continue; + + int64_t stage_start = os_time_get_nano(); + + vk_pipeline_hash_shader_stage(stages[s].info, stages[s].shader_sha1); + + const struct anv_device *device = pipeline->base.device; + switch (stages[s].stage) { + case MESA_SHADER_VERTEX: + populate_vs_prog_key(device, + pipeline->base.device->robust_buffer_access, + &stages[s].key.vs); + break; + case MESA_SHADER_TESS_CTRL: + populate_tcs_prog_key(device, + pipeline->base.device->robust_buffer_access, + state->ts->patch_control_points, + &stages[s].key.tcs); + break; + case MESA_SHADER_TESS_EVAL: + populate_tes_prog_key(device, + pipeline->base.device->robust_buffer_access, + &stages[s].key.tes); + break; + case MESA_SHADER_GEOMETRY: + populate_gs_prog_key(device, + pipeline->base.device->robust_buffer_access, + &stages[s].key.gs); + break; + case MESA_SHADER_FRAGMENT: { + populate_wm_prog_key(pipeline, + pipeline->base.device->robust_buffer_access, + state->dynamic, state->ms, state->fsr, state->rp, + &stages[s].key.wm); + break; + } + case MESA_SHADER_TASK: + populate_task_prog_key(device, + pipeline->base.device->robust_buffer_access, + &stages[s].key.task); + break; + case MESA_SHADER_MESH: + populate_mesh_prog_key(device, + pipeline->base.device->robust_buffer_access, + &stages[s].key.mesh); + break; + default: + unreachable("Invalid graphics shader stage"); + } + + stages[s].feedback.duration += os_time_get_nano() - stage_start; + stages[s].feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT; + } + + assert(pipeline->active_stages & VK_SHADER_STAGE_VERTEX_BIT || + pipeline->active_stages & VK_SHADER_STAGE_MESH_BIT_NV); +} + +static bool +anv_graphics_pipeline_load_cached_shaders(struct anv_graphics_pipeline *pipeline, + struct vk_pipeline_cache *cache, + struct anv_pipeline_stage *stages, + VkPipelineCreationFeedbackEXT *pipeline_feedback) +{ + unsigned found = 0; + unsigned cache_hits = 0; + for (unsigned s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) { + if (!stages[s].info) + continue; + + int64_t stage_start = os_time_get_nano(); + + bool cache_hit; + struct anv_shader_bin *bin = + anv_device_search_for_kernel(pipeline->base.device, cache, + &stages[s].cache_key, + sizeof(stages[s].cache_key), &cache_hit); + if (bin) { + found++; + pipeline->shaders[s] = bin; + } + + if (cache_hit) { + cache_hits++; + stages[s].feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + } + stages[s].feedback.duration += os_time_get_nano() - stage_start; + } + + if (found == __builtin_popcount(pipeline->active_stages)) { + if (cache_hits == found) { + 
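/* 'found' counts every stage we got a binary for while 'cache_hits' only counts true cache hits, so the pipeline-level hit flag is set only when all stages actually hit the cache. */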
pipeline_feedback->flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + } + /* We found all our shaders in the cache. We're done. */ + for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) { + if (!stages[s].info) + continue; + + anv_pipeline_add_executables(&pipeline->base, &stages[s], + pipeline->shaders[s]); + } + return true; + } else if (found > 0) { + /* We found some but not all of our shaders. This shouldn't happen most + * of the time but it can if we have a partially populated pipeline + * cache. + */ + assert(found < __builtin_popcount(pipeline->active_stages)); + + vk_perf(VK_LOG_OBJS(cache ? &cache->base : + &pipeline->base.device->vk.base), + "Found a partial pipeline in the cache. This is " + "most likely caused by an incomplete pipeline cache " + "import or export"); + + /* We're going to have to recompile anyway, so just throw away our + * references to the shaders in the cache. We'll get them out of the + * cache again as part of the compilation process. + */ + for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) { + stages[s].feedback.flags = 0; + if (pipeline->shaders[s]) { + anv_shader_bin_unref(pipeline->base.device, pipeline->shaders[s]); + pipeline->shaders[s] = NULL; + } + } + } + + return false; +} + +static const gl_shader_stage graphics_shader_order[] = { + MESA_SHADER_VERTEX, + MESA_SHADER_TESS_CTRL, + MESA_SHADER_TESS_EVAL, + MESA_SHADER_GEOMETRY, + + MESA_SHADER_TASK, + MESA_SHADER_MESH, + + MESA_SHADER_FRAGMENT, +}; + +static VkResult +anv_graphics_pipeline_load_nir(struct anv_graphics_pipeline *pipeline, + struct vk_pipeline_cache *cache, + struct anv_pipeline_stage *stages, + void *pipeline_ctx) +{ + for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) { + gl_shader_stage s = graphics_shader_order[i]; + if (!stages[s].info) + continue; + + int64_t stage_start = os_time_get_nano(); + + assert(stages[s].stage == s); + assert(pipeline->shaders[s] == NULL); + + stages[s].bind_map = (struct anv_pipeline_bind_map) { + .surface_to_descriptor = stages[s].surface_to_descriptor, + .sampler_to_descriptor = stages[s].sampler_to_descriptor + }; + + stages[s].nir = anv_pipeline_stage_get_nir(&pipeline->base, cache, + pipeline_ctx, + &stages[s]); + if (stages[s].nir == NULL) { + return vk_error(pipeline, VK_ERROR_UNKNOWN); + } + + stages[s].feedback.duration += os_time_get_nano() - stage_start; + } + + return VK_SUCCESS; +} + +static VkResult +anv_graphics_pipeline_compile(struct anv_graphics_pipeline *pipeline, + struct vk_pipeline_cache *cache, + const VkGraphicsPipelineCreateInfo *info, + const struct vk_graphics_pipeline_state *state) +{ + ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout); + VkResult result; + + VkPipelineCreationFeedbackEXT pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, + }; + int64_t pipeline_start = os_time_get_nano(); + + const struct brw_compiler *compiler = pipeline->base.device->physical->compiler; + struct anv_pipeline_stage stages[ANV_GRAPHICS_SHADER_STAGE_COUNT] = {}; + for (uint32_t i = 0; i < info->stageCount; i++) { + gl_shader_stage stage = vk_to_mesa_shader_stage(info->pStages[i].stage); + stages[stage].stage = stage; + stages[stage].info = &info->pStages[i]; + } + + anv_graphics_pipeline_init_keys(pipeline, state, stages); + + unsigned char sha1[20]; + anv_pipeline_hash_graphics(pipeline, layout, stages, sha1); + + for (unsigned s = 0; s < ARRAY_SIZE(stages); s++) { + if (!stages[s].info) + continue; + + stages[s].cache_key.stage = s; + 
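/* The cache key is the stage plus the whole-pipeline hash computed above (view mask, layout, robustness and every stage's SPIR-V hash and key), so a cached binary is only reused when all of those match. */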
memcpy(stages[s].cache_key.sha1, sha1, sizeof(sha1)); + } + + const bool skip_cache_lookup = + (pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR); + if (!skip_cache_lookup) { + bool found_all_shaders = + anv_graphics_pipeline_load_cached_shaders(pipeline, cache, stages, + &pipeline_feedback); + if (found_all_shaders) + goto done; + } + + if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) + return VK_PIPELINE_COMPILE_REQUIRED; + + void *pipeline_ctx = ralloc_context(NULL); + + result = anv_graphics_pipeline_load_nir(pipeline, cache, stages, + pipeline_ctx); + if (result != VK_SUCCESS) + goto fail; + + /* Walk backwards to link */ + struct anv_pipeline_stage *next_stage = NULL; + for (int i = ARRAY_SIZE(graphics_shader_order) - 1; i >= 0; i--) { + gl_shader_stage s = graphics_shader_order[i]; + if (!stages[s].info) + continue; + + switch (s) { + case MESA_SHADER_VERTEX: + anv_pipeline_link_vs(compiler, &stages[s], next_stage); + break; + case MESA_SHADER_TESS_CTRL: + anv_pipeline_link_tcs(compiler, &stages[s], next_stage); + break; + case MESA_SHADER_TESS_EVAL: + anv_pipeline_link_tes(compiler, &stages[s], next_stage); + break; + case MESA_SHADER_GEOMETRY: + anv_pipeline_link_gs(compiler, &stages[s], next_stage); + break; + case MESA_SHADER_TASK: + anv_pipeline_link_task(compiler, &stages[s], next_stage); + break; + case MESA_SHADER_MESH: + anv_pipeline_link_mesh(compiler, &stages[s], next_stage); + break; + case MESA_SHADER_FRAGMENT: + anv_pipeline_link_fs(compiler, &stages[s], state->rp); + break; + default: + unreachable("Invalid graphics shader stage"); + } + + next_stage = &stages[s]; + } + + bool use_primitive_replication = false; + if (pipeline->base.device->info->ver >= 12 && + pipeline->view_mask != 0) { + /* For some pipelines HW Primitive Replication can be used instead of + * instancing to implement Multiview. This depend on how viewIndex is + * used in all the active shaders, so this check can't be done per + * individual shaders. 
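+ * Since hasvk only loads on gfx7/8 platforms, the ver >= 12 check above
+ * never passes here and multiview always falls back to instanced draws.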
+ */ + nir_shader *shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT] = {}; + for (unsigned s = 0; s < ARRAY_SIZE(shaders); s++) + shaders[s] = stages[s].nir; + + use_primitive_replication = + anv_check_for_primitive_replication(pipeline->base.device, + pipeline->active_stages, + shaders, pipeline->view_mask); + } + + struct anv_pipeline_stage *prev_stage = NULL; + for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) { + gl_shader_stage s = graphics_shader_order[i]; + if (!stages[s].info) + continue; + + int64_t stage_start = os_time_get_nano(); + + void *stage_ctx = ralloc_context(NULL); + + anv_pipeline_lower_nir(&pipeline->base, stage_ctx, &stages[s], layout, + use_primitive_replication); + + if (prev_stage && compiler->nir_options[s]->unify_interfaces) { + prev_stage->nir->info.outputs_written |= stages[s].nir->info.inputs_read & + ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER); + stages[s].nir->info.inputs_read |= prev_stage->nir->info.outputs_written & + ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER); + prev_stage->nir->info.patch_outputs_written |= stages[s].nir->info.patch_inputs_read; + stages[s].nir->info.patch_inputs_read |= prev_stage->nir->info.patch_outputs_written; + } + + ralloc_free(stage_ctx); + + stages[s].feedback.duration += os_time_get_nano() - stage_start; + + prev_stage = &stages[s]; + } + + /* In the case the platform can write the primitive variable shading rate, + * figure out the last geometry stage that should write the primitive + * shading rate, and ensure it is marked as used there. The backend will + * write a default value if the shader doesn't actually write it. + * + * We iterate backwards in the stage and stop on the first shader that can + * set the value. + */ + const struct intel_device_info *devinfo = pipeline->base.device->info; + if (devinfo->has_coarse_pixel_primitive_and_cb && + stages[MESA_SHADER_FRAGMENT].info && + stages[MESA_SHADER_FRAGMENT].key.wm.coarse_pixel && + !stages[MESA_SHADER_FRAGMENT].nir->info.fs.uses_sample_shading && + stages[MESA_SHADER_MESH].info == NULL) { + struct anv_pipeline_stage *last_psr = NULL; + + for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) { + gl_shader_stage s = + graphics_shader_order[ARRAY_SIZE(graphics_shader_order) - i - 1]; + + if (!stages[s].info || + !gl_shader_stage_can_set_fragment_shading_rate(s)) + continue; + + last_psr = &stages[s]; + break; + } + + assert(last_psr); + last_psr->nir->info.outputs_written |= VARYING_BIT_PRIMITIVE_SHADING_RATE; + } + + prev_stage = NULL; + for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) { + gl_shader_stage s = graphics_shader_order[i]; + if (!stages[s].info) + continue; + + int64_t stage_start = os_time_get_nano(); + + void *stage_ctx = ralloc_context(NULL); + + switch (s) { + case MESA_SHADER_VERTEX: + anv_pipeline_compile_vs(compiler, stage_ctx, pipeline, + &stages[s]); + break; + case MESA_SHADER_TESS_CTRL: + anv_pipeline_compile_tcs(compiler, stage_ctx, pipeline->base.device, + &stages[s], prev_stage); + break; + case MESA_SHADER_TESS_EVAL: + anv_pipeline_compile_tes(compiler, stage_ctx, pipeline->base.device, + &stages[s], prev_stage); + break; + case MESA_SHADER_GEOMETRY: + anv_pipeline_compile_gs(compiler, stage_ctx, pipeline->base.device, + &stages[s], prev_stage); + break; + case MESA_SHADER_TASK: + anv_pipeline_compile_task(compiler, stage_ctx, pipeline->base.device, + &stages[s]); + break; + case MESA_SHADER_MESH: + anv_pipeline_compile_mesh(compiler, stage_ctx, pipeline->base.device, + 
&stages[s], prev_stage); + break; + case MESA_SHADER_FRAGMENT: + anv_pipeline_compile_fs(compiler, stage_ctx, pipeline->base.device, + &stages[s], prev_stage); + break; + default: + unreachable("Invalid graphics shader stage"); + } + if (stages[s].code == NULL) { + ralloc_free(stage_ctx); + result = vk_error(pipeline->base.device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + anv_nir_validate_push_layout(&stages[s].prog_data.base, + &stages[s].bind_map); + + struct anv_shader_bin *bin = + anv_device_upload_kernel(pipeline->base.device, cache, s, + &stages[s].cache_key, + sizeof(stages[s].cache_key), + stages[s].code, + stages[s].prog_data.base.program_size, + &stages[s].prog_data.base, + brw_prog_data_size(s), + stages[s].stats, stages[s].num_stats, + stages[s].nir->xfb_info, + &stages[s].bind_map); + if (!bin) { + ralloc_free(stage_ctx); + result = vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + anv_pipeline_add_executables(&pipeline->base, &stages[s], bin); + + pipeline->shaders[s] = bin; + ralloc_free(stage_ctx); + + stages[s].feedback.duration += os_time_get_nano() - stage_start; + + prev_stage = &stages[s]; + } + + ralloc_free(pipeline_ctx); + +done: + + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; + + const VkPipelineCreationFeedbackCreateInfo *create_feedback = + vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); + if (create_feedback) { + *create_feedback->pPipelineCreationFeedback = pipeline_feedback; + + assert(info->stageCount == create_feedback->pipelineStageCreationFeedbackCount); + for (uint32_t i = 0; i < info->stageCount; i++) { + gl_shader_stage s = vk_to_mesa_shader_stage(info->pStages[i].stage); + create_feedback->pPipelineStageCreationFeedbacks[i] = stages[s].feedback; + } + } + + return VK_SUCCESS; + +fail: + ralloc_free(pipeline_ctx); + + for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) { + if (pipeline->shaders[s]) + anv_shader_bin_unref(pipeline->base.device, pipeline->shaders[s]); + } + + return result; +} + +static VkResult +anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline, + struct vk_pipeline_cache *cache, + const VkComputePipelineCreateInfo *info) +{ + const VkPipelineShaderStageCreateInfo *sinfo = &info->stage; + assert(sinfo->stage == VK_SHADER_STAGE_COMPUTE_BIT); + + VkPipelineCreationFeedback pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, + }; + int64_t pipeline_start = os_time_get_nano(); + + struct anv_device *device = pipeline->base.device; + const struct brw_compiler *compiler = device->physical->compiler; + + struct anv_pipeline_stage stage = { + .stage = MESA_SHADER_COMPUTE, + .info = &info->stage, + .cache_key = { + .stage = MESA_SHADER_COMPUTE, + }, + .feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, + }, + }; + vk_pipeline_hash_shader_stage(&info->stage, stage.shader_sha1); + + struct anv_shader_bin *bin = NULL; + + populate_cs_prog_key(device, device->robust_buffer_access, &stage.key.cs); + + ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout); + + const bool skip_cache_lookup = + (pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR); + + anv_pipeline_hash_compute(pipeline, layout, &stage, stage.cache_key.sha1); + + bool cache_hit = false; + if (!skip_cache_lookup) { + bin = anv_device_search_for_kernel(device, cache, + &stage.cache_key, + sizeof(stage.cache_key), + &cache_hit); + } + + if (bin == NULL && + (info->flags & 
VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT)) + return VK_PIPELINE_COMPILE_REQUIRED; + + void *mem_ctx = ralloc_context(NULL); + if (bin == NULL) { + int64_t stage_start = os_time_get_nano(); + + stage.bind_map = (struct anv_pipeline_bind_map) { + .surface_to_descriptor = stage.surface_to_descriptor, + .sampler_to_descriptor = stage.sampler_to_descriptor + }; + + /* Set up a binding for the gl_NumWorkGroups */ + stage.bind_map.surface_count = 1; + stage.bind_map.surface_to_descriptor[0] = (struct anv_pipeline_binding) { + .set = ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS, + }; + + stage.nir = anv_pipeline_stage_get_nir(&pipeline->base, cache, mem_ctx, &stage); + if (stage.nir == NULL) { + ralloc_free(mem_ctx); + return vk_error(pipeline, VK_ERROR_UNKNOWN); + } + + NIR_PASS(_, stage.nir, anv_nir_add_base_work_group_id); + + anv_pipeline_lower_nir(&pipeline->base, mem_ctx, &stage, layout, + false /* use_primitive_replication */); + + unsigned local_size = stage.nir->info.workgroup_size[0] * + stage.nir->info.workgroup_size[1] * + stage.nir->info.workgroup_size[2]; + + /* Games don't always request full subgroups when they should, + * which can cause bugs, as they may expect bigger size of the + * subgroup than we choose for the execution. + */ + if (device->physical->instance->assume_full_subgroups && + stage.nir->info.cs.uses_wide_subgroup_intrinsics && + stage.nir->info.subgroup_size == SUBGROUP_SIZE_API_CONSTANT && + local_size && + local_size % BRW_SUBGROUP_SIZE == 0) + stage.nir->info.subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS; + + /* If the client requests that we dispatch full subgroups but doesn't + * allow us to pick a subgroup size, we have to smash it to the API + * value of 32. Performance will likely be terrible in this case but + * there's nothing we can do about that. The client should have chosen + * a size. 
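+ * In that case SUBGROUP_SIZE_FULL_SUBGROUPS is simply replaced with
+ * BRW_SUBGROUP_SIZE (32) just below.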
+ */ + if (stage.nir->info.subgroup_size == SUBGROUP_SIZE_FULL_SUBGROUPS) + stage.nir->info.subgroup_size = BRW_SUBGROUP_SIZE; + + stage.num_stats = 1; + + struct brw_compile_cs_params params = { + .nir = stage.nir, + .key = &stage.key.cs, + .prog_data = &stage.prog_data.cs, + .stats = stage.stats, + .log_data = device, + }; + + stage.code = brw_compile_cs(compiler, mem_ctx, ¶ms); + if (stage.code == NULL) { + ralloc_free(mem_ctx); + return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + anv_nir_validate_push_layout(&stage.prog_data.base, &stage.bind_map); + + if (!stage.prog_data.cs.uses_num_work_groups) { + assert(stage.bind_map.surface_to_descriptor[0].set == + ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS); + stage.bind_map.surface_to_descriptor[0].set = ANV_DESCRIPTOR_SET_NULL; + } + + const unsigned code_size = stage.prog_data.base.program_size; + bin = anv_device_upload_kernel(device, cache, + MESA_SHADER_COMPUTE, + &stage.cache_key, sizeof(stage.cache_key), + stage.code, code_size, + &stage.prog_data.base, + sizeof(stage.prog_data.cs), + stage.stats, stage.num_stats, + NULL, &stage.bind_map); + if (!bin) { + ralloc_free(mem_ctx); + return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + stage.feedback.duration = os_time_get_nano() - stage_start; + } + + anv_pipeline_add_executables(&pipeline->base, &stage, bin); + + ralloc_free(mem_ctx); + + if (cache_hit) { + stage.feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + pipeline_feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + } + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; + + const VkPipelineCreationFeedbackCreateInfo *create_feedback = + vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); + if (create_feedback) { + *create_feedback->pPipelineCreationFeedback = pipeline_feedback; + + assert(create_feedback->pipelineStageCreationFeedbackCount == 1); + create_feedback->pPipelineStageCreationFeedbacks[0] = stage.feedback; + } + + pipeline->cs = bin; + + return VK_SUCCESS; +} + +static VkResult +anv_compute_pipeline_create(struct anv_device *device, + struct vk_pipeline_cache *cache, + const VkComputePipelineCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkPipeline *pPipeline) +{ + struct anv_compute_pipeline *pipeline; + VkResult result; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO); + + pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pipeline == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = anv_pipeline_init(&pipeline->base, device, + ANV_PIPELINE_COMPUTE, pCreateInfo->flags, + pAllocator); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, pipeline); + return result; + } + + anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS, + pipeline->batch_data, sizeof(pipeline->batch_data)); + + result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo); + if (result != VK_SUCCESS) { + anv_pipeline_finish(&pipeline->base, device, pAllocator); + vk_free2(&device->vk.alloc, pAllocator, pipeline); + return result; + } + + anv_genX(device->info, compute_pipeline_emit)(pipeline); + + *pPipeline = anv_pipeline_to_handle(&pipeline->base); + + return pipeline->base.batch.status; +} + +VkResult anv_CreateComputePipelines( + VkDevice _device, + VkPipelineCache pipelineCache, + uint32_t count, + const VkComputePipelineCreateInfo* 
pCreateInfos, + const VkAllocationCallbacks* pAllocator, + VkPipeline* pPipelines) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(vk_pipeline_cache, pipeline_cache, pipelineCache); + + VkResult result = VK_SUCCESS; + + unsigned i; + for (i = 0; i < count; i++) { + VkResult res = anv_compute_pipeline_create(device, pipeline_cache, + &pCreateInfos[i], + pAllocator, &pPipelines[i]); + + if (res == VK_SUCCESS) + continue; + + /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED as it + * is not obvious what error should be report upon 2 different failures. + * */ + result = res; + if (res != VK_PIPELINE_COMPILE_REQUIRED) + break; + + pPipelines[i] = VK_NULL_HANDLE; + + if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) + break; + } + + for (; i < count; i++) + pPipelines[i] = VK_NULL_HANDLE; + + return result; +} + +/** + * Calculate the desired L3 partitioning based on the current state of the + * pipeline. For now this simply returns the conservative defaults calculated + * by get_default_l3_weights(), but we could probably do better by gathering + * more statistics from the pipeline state (e.g. guess of expected URB usage + * and bound surfaces), or by using feed-back from performance counters. + */ +void +anv_pipeline_setup_l3_config(struct anv_pipeline *pipeline, bool needs_slm) +{ + const struct intel_device_info *devinfo = pipeline->device->info; + + const struct intel_l3_weights w = + intel_get_default_l3_weights(devinfo, true, needs_slm); + + pipeline->l3_config = intel_get_l3_config(devinfo, w); +} + +static VkResult +anv_graphics_pipeline_init(struct anv_graphics_pipeline *pipeline, + struct anv_device *device, + struct vk_pipeline_cache *cache, + const struct VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct vk_graphics_pipeline_state *state, + const VkAllocationCallbacks *alloc) +{ + VkResult result; + + result = anv_pipeline_init(&pipeline->base, device, + ANV_PIPELINE_GRAPHICS, pCreateInfo->flags, + alloc); + if (result != VK_SUCCESS) + return result; + + anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS, + pipeline->batch_data, sizeof(pipeline->batch_data)); + + pipeline->active_stages = 0; + for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) + pipeline->active_stages |= pCreateInfo->pStages[i].stage; + + if (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) + pipeline->active_stages |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; + + if (anv_pipeline_is_mesh(pipeline)) + assert(device->physical->vk.supported_extensions.NV_mesh_shader); + + pipeline->dynamic_state.ms.sample_locations = &pipeline->sample_locations; + vk_dynamic_graphics_state_fill(&pipeline->dynamic_state, state); + + pipeline->depth_clamp_enable = state->rs->depth_clamp_enable; + pipeline->depth_clip_enable = state->rs->depth_clip_enable; + pipeline->view_mask = state->rp->view_mask; + + result = anv_graphics_pipeline_compile(pipeline, cache, pCreateInfo, state); + if (result != VK_SUCCESS) { + anv_pipeline_finish(&pipeline->base, device, alloc); + return result; + } + + anv_pipeline_setup_l3_config(&pipeline->base, false); + + if (anv_pipeline_is_primitive(pipeline)) { + const uint64_t inputs_read = get_vs_prog_data(pipeline)->inputs_read; + + u_foreach_bit(a, state->vi->attributes_valid) { + if (inputs_read & BITFIELD64_BIT(VERT_ATTRIB_GENERIC0 + a)) + pipeline->vb_used |= BITFIELD64_BIT(state->vi->attributes[a].binding); + } + + u_foreach_bit(b, state->vi->bindings_valid) { + pipeline->vb[b].stride = 
state->vi->bindings[b].stride; + pipeline->vb[b].instanced = state->vi->bindings[b].input_rate == + VK_VERTEX_INPUT_RATE_INSTANCE; + pipeline->vb[b].instance_divisor = state->vi->bindings[b].divisor; + } + + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views when primitive replication cannot be used. If the + * client asks for instancing, we need to multiply by the client's + * instance count at draw time and instance divisor in the vertex + * bindings by the number of views ensure that we repeat the client's + * per-instance data once for each view. + */ + const bool uses_primitive_replication = + anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots > 1; + pipeline->instance_multiplier = 1; + if (pipeline->view_mask && !uses_primitive_replication) + pipeline->instance_multiplier = util_bitcount(pipeline->view_mask); + } else { + assert(anv_pipeline_is_mesh(pipeline)); + /* TODO(mesh): Mesh vs. Multiview with Instancing. */ + } + + pipeline->negative_one_to_one = + state->vp != NULL && state->vp->negative_one_to_one; + + /* Store line mode, polygon mode and rasterization samples, these are used + * for dynamic primitive topology. + */ + pipeline->polygon_mode = state->rs->polygon_mode; + pipeline->rasterization_samples = + state->ms != NULL ? state->ms->rasterization_samples : 1; + pipeline->line_mode = state->rs->line.mode; + if (pipeline->line_mode == VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT) { + if (pipeline->rasterization_samples > 1) { + pipeline->line_mode = VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT; + } else { + pipeline->line_mode = VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT; + } + } + pipeline->patch_control_points = + state->ts != NULL ? state->ts->patch_control_points : 0; + + /* Store the color write masks, to be merged with color write enable if + * dynamic. 
+ */ + if (state->cb != NULL) { + for (unsigned i = 0; i < state->cb->attachment_count; i++) + pipeline->color_comp_writes[i] = state->cb->attachments[i].write_mask; + } + + return VK_SUCCESS; +} + +static VkResult +anv_graphics_pipeline_create(struct anv_device *device, + struct vk_pipeline_cache *cache, + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + VkPipeline *pPipeline) +{ + struct anv_graphics_pipeline *pipeline; + VkResult result; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO); + + pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pipeline == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + struct vk_graphics_pipeline_all_state all; + struct vk_graphics_pipeline_state state = { }; + result = vk_graphics_pipeline_state_fill(&device->vk, &state, pCreateInfo, + NULL /* sp_info */, + &all, NULL, 0, NULL); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, pipeline); + return result; + } + + result = anv_graphics_pipeline_init(pipeline, device, cache, + pCreateInfo, &state, pAllocator); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, pipeline); + return result; + } + + anv_genX(device->info, graphics_pipeline_emit)(pipeline, &state); + + *pPipeline = anv_pipeline_to_handle(&pipeline->base); + + return pipeline->base.batch.status; +} + +VkResult anv_CreateGraphicsPipelines( + VkDevice _device, + VkPipelineCache pipelineCache, + uint32_t count, + const VkGraphicsPipelineCreateInfo* pCreateInfos, + const VkAllocationCallbacks* pAllocator, + VkPipeline* pPipelines) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(vk_pipeline_cache, pipeline_cache, pipelineCache); + + VkResult result = VK_SUCCESS; + + unsigned i; + for (i = 0; i < count; i++) { + VkResult res = anv_graphics_pipeline_create(device, + pipeline_cache, + &pCreateInfos[i], + pAllocator, &pPipelines[i]); + + if (res == VK_SUCCESS) + continue; + + /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED as it + * is not obvious what error should be report upon 2 different failures. 
+ * */ + result = res; + if (res != VK_PIPELINE_COMPILE_REQUIRED) + break; + + pPipelines[i] = VK_NULL_HANDLE; + + if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) + break; + } + + for (; i < count; i++) + pPipelines[i] = VK_NULL_HANDLE; + + return result; +} + +static VkResult +compile_upload_rt_shader(struct anv_ray_tracing_pipeline *pipeline, + struct vk_pipeline_cache *cache, + nir_shader *nir, + struct anv_pipeline_stage *stage, + struct anv_shader_bin **shader_out, + void *mem_ctx) +{ + const struct brw_compiler *compiler = + pipeline->base.device->physical->compiler; + const struct intel_device_info *devinfo = compiler->devinfo; + + nir_shader **resume_shaders = NULL; + uint32_t num_resume_shaders = 0; + if (nir->info.stage != MESA_SHADER_COMPUTE) { + NIR_PASS(_, nir, nir_lower_shader_calls, + nir_address_format_64bit_global, + BRW_BTD_STACK_ALIGN, + &resume_shaders, &num_resume_shaders, mem_ctx); + NIR_PASS(_, nir, brw_nir_lower_shader_calls); + NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo); + } + + for (unsigned i = 0; i < num_resume_shaders; i++) { + NIR_PASS(_,resume_shaders[i], brw_nir_lower_shader_calls); + NIR_PASS_V(resume_shaders[i], brw_nir_lower_rt_intrinsics, devinfo); + } + + struct brw_compile_bs_params params = { + .nir = nir, + .key = &stage->key.bs, + .prog_data = &stage->prog_data.bs, + .num_resume_shaders = num_resume_shaders, + .resume_shaders = resume_shaders, + + .stats = stage->stats, + .log_data = pipeline->base.device, + }; + + stage->code = brw_compile_bs(compiler, mem_ctx, ¶ms); + if (stage->code == NULL) + return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* Ray-tracing shaders don't have a "real" bind map */ + struct anv_pipeline_bind_map empty_bind_map = {}; + + const unsigned code_size = stage->prog_data.base.program_size; + struct anv_shader_bin *bin = + anv_device_upload_kernel(pipeline->base.device, + cache, + stage->stage, + &stage->cache_key, sizeof(stage->cache_key), + stage->code, code_size, + &stage->prog_data.base, + sizeof(stage->prog_data.bs), + stage->stats, 1, + NULL, &empty_bind_map); + if (bin == NULL) + return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* TODO: Figure out executables for resume shaders */ + anv_pipeline_add_executables(&pipeline->base, stage, bin); + util_dynarray_append(&pipeline->shaders, struct anv_shader_bin *, bin); + + *shader_out = bin; + + return VK_SUCCESS; +} + +static bool +is_rt_stack_size_dynamic(const VkRayTracingPipelineCreateInfoKHR *info) +{ + if (info->pDynamicState == NULL) + return false; + + for (unsigned i = 0; i < info->pDynamicState->dynamicStateCount; i++) { + if (info->pDynamicState->pDynamicStates[i] == + VK_DYNAMIC_STATE_RAY_TRACING_PIPELINE_STACK_SIZE_KHR) + return true; + } + + return false; +} + +static void +anv_pipeline_compute_ray_tracing_stacks(struct anv_ray_tracing_pipeline *pipeline, + const VkRayTracingPipelineCreateInfoKHR *info, + uint32_t *stack_max) +{ + if (is_rt_stack_size_dynamic(info)) { + pipeline->stack_size = 0; /* 0 means dynamic */ + } else { + /* From the Vulkan spec: + * + * "If the stack size is not set explicitly, the stack size for a + * pipeline is: + * + * rayGenStackMax + + * min(1, maxPipelineRayRecursionDepth) × + * max(closestHitStackMax, missStackMax, + * intersectionStackMax + anyHitStackMax) + + * max(0, maxPipelineRayRecursionDepth-1) × + * max(closestHitStackMax, missStackMax) + + * 2 × callableStackMax" + */ + pipeline->stack_size = + stack_max[MESA_SHADER_RAYGEN] + + MIN2(1, 
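/* this whole term drops out when the recursion depth is 0 */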
info->maxPipelineRayRecursionDepth) * + MAX4(stack_max[MESA_SHADER_CLOSEST_HIT], + stack_max[MESA_SHADER_MISS], + stack_max[MESA_SHADER_INTERSECTION], + stack_max[MESA_SHADER_ANY_HIT]) + + MAX2(0, (int)info->maxPipelineRayRecursionDepth - 1) * + MAX2(stack_max[MESA_SHADER_CLOSEST_HIT], + stack_max[MESA_SHADER_MISS]) + + 2 * stack_max[MESA_SHADER_CALLABLE]; + + /* This is an extremely unlikely case but we need to set it to some + * non-zero value so that we don't accidentally think it's dynamic. + * Our minimum stack size is 2KB anyway so we could set to any small + * value we like. + */ + if (pipeline->stack_size == 0) + pipeline->stack_size = 1; + } +} + +static struct anv_pipeline_stage * +anv_pipeline_init_ray_tracing_stages(struct anv_ray_tracing_pipeline *pipeline, + const VkRayTracingPipelineCreateInfoKHR *info, + void *pipeline_ctx) +{ + ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout); + + /* Create enough stage entries for all shader modules plus potential + * combinaisons in the groups. + */ + struct anv_pipeline_stage *stages = + rzalloc_array(pipeline_ctx, struct anv_pipeline_stage, info->stageCount); + + for (uint32_t i = 0; i < info->stageCount; i++) { + const VkPipelineShaderStageCreateInfo *sinfo = &info->pStages[i]; + if (vk_pipeline_shader_stage_is_null(sinfo)) + continue; + + int64_t stage_start = os_time_get_nano(); + + stages[i] = (struct anv_pipeline_stage) { + .stage = vk_to_mesa_shader_stage(sinfo->stage), + .info = sinfo, + .cache_key = { + .stage = vk_to_mesa_shader_stage(sinfo->stage), + }, + .feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, + }, + }; + + populate_bs_prog_key(pipeline->base.device, + pipeline->base.device->robust_buffer_access, + &stages[i].key.bs); + + vk_pipeline_hash_shader_stage(sinfo, stages[i].shader_sha1); + + if (stages[i].stage != MESA_SHADER_INTERSECTION) { + anv_pipeline_hash_ray_tracing_shader(pipeline, layout, &stages[i], + stages[i].cache_key.sha1); + } + + stages[i].feedback.duration += os_time_get_nano() - stage_start; + } + + for (uint32_t i = 0; i < info->groupCount; i++) { + const VkRayTracingShaderGroupCreateInfoKHR *ginfo = &info->pGroups[i]; + + if (ginfo->type != VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR) + continue; + + int64_t stage_start = os_time_get_nano(); + + uint32_t intersection_idx = ginfo->intersectionShader; + assert(intersection_idx < info->stageCount); + + uint32_t any_hit_idx = ginfo->anyHitShader; + if (any_hit_idx != VK_SHADER_UNUSED_KHR) { + assert(any_hit_idx < info->stageCount); + anv_pipeline_hash_ray_tracing_combined_shader(pipeline, + layout, + &stages[intersection_idx], + &stages[any_hit_idx], + stages[intersection_idx].cache_key.sha1); + } else { + anv_pipeline_hash_ray_tracing_shader(pipeline, layout, + &stages[intersection_idx], + stages[intersection_idx].cache_key.sha1); + } + + stages[intersection_idx].feedback.duration += os_time_get_nano() - stage_start; + } + + return stages; +} + +static bool +anv_pipeline_load_cached_shaders(struct anv_ray_tracing_pipeline *pipeline, + struct vk_pipeline_cache *cache, + const VkRayTracingPipelineCreateInfoKHR *info, + struct anv_pipeline_stage *stages, + uint32_t *stack_max) +{ + uint32_t shaders = 0, cache_hits = 0; + for (uint32_t i = 0; i < info->stageCount; i++) { + if (stages[i].info == NULL) + continue; + + shaders++; + + int64_t stage_start = os_time_get_nano(); + + bool cache_hit; + stages[i].bin = anv_device_search_for_kernel(pipeline->base.device, cache, + &stages[i].cache_key, + 
sizeof(stages[i].cache_key), + &cache_hit); + if (cache_hit) { + cache_hits++; + stages[i].feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + } + + if (stages[i].bin != NULL) { + anv_pipeline_add_executables(&pipeline->base, &stages[i], stages[i].bin); + util_dynarray_append(&pipeline->shaders, struct anv_shader_bin *, stages[i].bin); + + uint32_t stack_size = + brw_bs_prog_data_const(stages[i].bin->prog_data)->max_stack_size; + stack_max[stages[i].stage] = + MAX2(stack_max[stages[i].stage], stack_size); + } + + stages[i].feedback.duration += os_time_get_nano() - stage_start; + } + + return cache_hits == shaders; +} + +static VkResult +anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline, + struct vk_pipeline_cache *cache, + const VkRayTracingPipelineCreateInfoKHR *info) +{ + const struct intel_device_info *devinfo = pipeline->base.device->info; + VkResult result; + + VkPipelineCreationFeedback pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT, + }; + int64_t pipeline_start = os_time_get_nano(); + + void *pipeline_ctx = ralloc_context(NULL); + + struct anv_pipeline_stage *stages = + anv_pipeline_init_ray_tracing_stages(pipeline, info, pipeline_ctx); + + ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout); + + const bool skip_cache_lookup = + (pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR); + + uint32_t stack_max[MESA_VULKAN_SHADER_STAGES] = {}; + + if (!skip_cache_lookup && + anv_pipeline_load_cached_shaders(pipeline, cache, info, stages, stack_max)) { + pipeline_feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT; + goto done; + } + + if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) { + ralloc_free(pipeline_ctx); + return VK_PIPELINE_COMPILE_REQUIRED; + } + + for (uint32_t i = 0; i < info->stageCount; i++) { + if (stages[i].info == NULL) + continue; + + int64_t stage_start = os_time_get_nano(); + + stages[i].nir = anv_pipeline_stage_get_nir(&pipeline->base, cache, + pipeline_ctx, &stages[i]); + if (stages[i].nir == NULL) { + ralloc_free(pipeline_ctx); + return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + anv_pipeline_lower_nir(&pipeline->base, pipeline_ctx, &stages[i], + layout, false /* use_primitive_replication */); + + stages[i].feedback.duration += os_time_get_nano() - stage_start; + } + + for (uint32_t i = 0; i < info->stageCount; i++) { + if (stages[i].info == NULL) + continue; + + /* Shader found in cache already. 
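+       * Reuse the cached binary and skip compiling this stage below.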
*/ + if (stages[i].bin != NULL) + continue; + + /* We handle intersection shaders as part of the group */ + if (stages[i].stage == MESA_SHADER_INTERSECTION) + continue; + + int64_t stage_start = os_time_get_nano(); + + void *stage_ctx = ralloc_context(pipeline_ctx); + + nir_shader *nir = nir_shader_clone(stage_ctx, stages[i].nir); + switch (stages[i].stage) { + case MESA_SHADER_RAYGEN: + brw_nir_lower_raygen(nir); + break; + + case MESA_SHADER_ANY_HIT: + brw_nir_lower_any_hit(nir, devinfo); + break; + + case MESA_SHADER_CLOSEST_HIT: + brw_nir_lower_closest_hit(nir); + break; + + case MESA_SHADER_MISS: + brw_nir_lower_miss(nir); + break; + + case MESA_SHADER_INTERSECTION: + unreachable("These are handled later"); + + case MESA_SHADER_CALLABLE: + brw_nir_lower_callable(nir); + break; + + default: + unreachable("Invalid ray-tracing shader stage"); + } + + result = compile_upload_rt_shader(pipeline, cache, nir, &stages[i], + &stages[i].bin, stage_ctx); + if (result != VK_SUCCESS) { + ralloc_free(pipeline_ctx); + return result; + } + + uint32_t stack_size = + brw_bs_prog_data_const(stages[i].bin->prog_data)->max_stack_size; + stack_max[stages[i].stage] = MAX2(stack_max[stages[i].stage], stack_size); + + ralloc_free(stage_ctx); + + stages[i].feedback.duration += os_time_get_nano() - stage_start; + } + + for (uint32_t i = 0; i < info->groupCount; i++) { + const VkRayTracingShaderGroupCreateInfoKHR *ginfo = &info->pGroups[i]; + struct anv_rt_shader_group *group = &pipeline->groups[i]; + group->type = ginfo->type; + switch (ginfo->type) { + case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: + assert(ginfo->generalShader < info->stageCount); + group->general = stages[ginfo->generalShader].bin; + break; + + case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: + if (ginfo->anyHitShader < info->stageCount) + group->any_hit = stages[ginfo->anyHitShader].bin; + + if (ginfo->closestHitShader < info->stageCount) + group->closest_hit = stages[ginfo->closestHitShader].bin; + break; + + case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: { + if (ginfo->closestHitShader < info->stageCount) + group->closest_hit = stages[ginfo->closestHitShader].bin; + + uint32_t intersection_idx = info->pGroups[i].intersectionShader; + assert(intersection_idx < info->stageCount); + + /* Only compile this stage if not already found in the cache. 
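+       * The intersection shader is compiled here, as part of its group,
+       * because it may have to be fused with the group's any-hit shader.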
*/ + if (stages[intersection_idx].bin == NULL) { + /* The any-hit and intersection shader have to be combined */ + uint32_t any_hit_idx = info->pGroups[i].anyHitShader; + const nir_shader *any_hit = NULL; + if (any_hit_idx < info->stageCount) + any_hit = stages[any_hit_idx].nir; + + void *group_ctx = ralloc_context(pipeline_ctx); + nir_shader *intersection = + nir_shader_clone(group_ctx, stages[intersection_idx].nir); + + brw_nir_lower_combined_intersection_any_hit(intersection, any_hit, + devinfo); + + result = compile_upload_rt_shader(pipeline, cache, + intersection, + &stages[intersection_idx], + &group->intersection, + group_ctx); + ralloc_free(group_ctx); + if (result != VK_SUCCESS) + return result; + } else { + group->intersection = stages[intersection_idx].bin; + } + + uint32_t stack_size = + brw_bs_prog_data_const(group->intersection->prog_data)->max_stack_size; + stack_max[MESA_SHADER_INTERSECTION] = + MAX2(stack_max[MESA_SHADER_INTERSECTION], stack_size); + + break; + } + + default: + unreachable("Invalid ray tracing shader group type"); + } + } + + done: + ralloc_free(pipeline_ctx); + + anv_pipeline_compute_ray_tracing_stacks(pipeline, info, stack_max); + + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; + + const VkPipelineCreationFeedbackCreateInfo *create_feedback = + vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO); + if (create_feedback) { + *create_feedback->pPipelineCreationFeedback = pipeline_feedback; + + assert(info->stageCount == create_feedback->pipelineStageCreationFeedbackCount); + for (uint32_t i = 0; i < info->stageCount; i++) { + gl_shader_stage s = vk_to_mesa_shader_stage(info->pStages[i].stage); + create_feedback->pPipelineStageCreationFeedbacks[i] = stages[s].feedback; + } + } + + return VK_SUCCESS; +} + +VkResult +anv_device_init_rt_shaders(struct anv_device *device) +{ + if (!device->vk.enabled_extensions.KHR_ray_tracing_pipeline) + return VK_SUCCESS; + + bool cache_hit; + + struct brw_rt_trampoline { + char name[16]; + struct brw_cs_prog_key key; + } trampoline_key = { + .name = "rt-trampoline", + }; + device->rt_trampoline = + anv_device_search_for_kernel(device, device->internal_cache, + &trampoline_key, sizeof(trampoline_key), + &cache_hit); + if (device->rt_trampoline == NULL) { + + void *tmp_ctx = ralloc_context(NULL); + nir_shader *trampoline_nir = + brw_nir_create_raygen_trampoline(device->physical->compiler, tmp_ctx); + + trampoline_nir->info.subgroup_size = SUBGROUP_SIZE_REQUIRE_8; + + struct anv_pipeline_bind_map bind_map = { + .surface_count = 0, + .sampler_count = 0, + }; + uint32_t dummy_params[4] = { 0, }; + struct brw_cs_prog_data trampoline_prog_data = { + .base.nr_params = 4, + .base.param = dummy_params, + .uses_inline_data = true, + .uses_btd_stack_ids = true, + }; + struct brw_compile_cs_params params = { + .nir = trampoline_nir, + .key = &trampoline_key.key, + .prog_data = &trampoline_prog_data, + .log_data = device, + }; + const unsigned *tramp_data = + brw_compile_cs(device->physical->compiler, tmp_ctx, ¶ms); + + device->rt_trampoline = + anv_device_upload_kernel(device, device->internal_cache, + MESA_SHADER_COMPUTE, + &trampoline_key, sizeof(trampoline_key), + tramp_data, + trampoline_prog_data.base.program_size, + &trampoline_prog_data.base, + sizeof(trampoline_prog_data), + NULL, 0, NULL, &bind_map); + + ralloc_free(tmp_ctx); + + if (device->rt_trampoline == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + /* The cache already has a reference and it's not going 
anywhere so there + * is no need to hold a second reference. + */ + anv_shader_bin_unref(device, device->rt_trampoline); + + struct brw_rt_trivial_return { + char name[16]; + struct brw_bs_prog_key key; + } return_key = { + .name = "rt-trivial-ret", + }; + device->rt_trivial_return = + anv_device_search_for_kernel(device, device->internal_cache, + &return_key, sizeof(return_key), + &cache_hit); + if (device->rt_trivial_return == NULL) { + void *tmp_ctx = ralloc_context(NULL); + nir_shader *trivial_return_nir = + brw_nir_create_trivial_return_shader(device->physical->compiler, tmp_ctx); + + NIR_PASS_V(trivial_return_nir, brw_nir_lower_rt_intrinsics, device->info); + + struct anv_pipeline_bind_map bind_map = { + .surface_count = 0, + .sampler_count = 0, + }; + struct brw_bs_prog_data return_prog_data = { 0, }; + struct brw_compile_bs_params params = { + .nir = trivial_return_nir, + .key = &return_key.key, + .prog_data = &return_prog_data, + + .log_data = device, + }; + const unsigned *return_data = + brw_compile_bs(device->physical->compiler, tmp_ctx, ¶ms); + + device->rt_trivial_return = + anv_device_upload_kernel(device, device->internal_cache, + MESA_SHADER_CALLABLE, + &return_key, sizeof(return_key), + return_data, return_prog_data.base.program_size, + &return_prog_data.base, sizeof(return_prog_data), + NULL, 0, NULL, &bind_map); + + ralloc_free(tmp_ctx); + + if (device->rt_trivial_return == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + /* The cache already has a reference and it's not going anywhere so there + * is no need to hold a second reference. + */ + anv_shader_bin_unref(device, device->rt_trivial_return); + + return VK_SUCCESS; +} + +void +anv_device_finish_rt_shaders(struct anv_device *device) +{ + if (!device->vk.enabled_extensions.KHR_ray_tracing_pipeline) + return; +} + +static VkResult +anv_ray_tracing_pipeline_init(struct anv_ray_tracing_pipeline *pipeline, + struct anv_device *device, + struct vk_pipeline_cache *cache, + const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks *alloc) +{ + VkResult result; + + util_dynarray_init(&pipeline->shaders, pipeline->base.mem_ctx); + + result = anv_pipeline_compile_ray_tracing(pipeline, cache, pCreateInfo); + if (result != VK_SUCCESS) + goto fail; + + anv_pipeline_setup_l3_config(&pipeline->base, /* needs_slm */ false); + + return VK_SUCCESS; + +fail: + util_dynarray_foreach(&pipeline->shaders, + struct anv_shader_bin *, shader) { + anv_shader_bin_unref(device, *shader); + } + return result; +} + +static void +assert_rt_stage_index_valid(const VkRayTracingPipelineCreateInfoKHR* pCreateInfo, + uint32_t stage_idx, + VkShaderStageFlags valid_stages) +{ + if (stage_idx == VK_SHADER_UNUSED_KHR) + return; + + assert(stage_idx <= pCreateInfo->stageCount); + assert(util_bitcount(pCreateInfo->pStages[stage_idx].stage) == 1); + assert(pCreateInfo->pStages[stage_idx].stage & valid_stages); +} + +static VkResult +anv_ray_tracing_pipeline_create( + VkDevice _device, + struct vk_pipeline_cache * cache, + const VkRayTracingPipelineCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkPipeline* pPipeline) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + VkResult result; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR); + + VK_MULTIALLOC(ma); + VK_MULTIALLOC_DECL(&ma, struct anv_ray_tracing_pipeline, pipeline, 1); + VK_MULTIALLOC_DECL(&ma, struct anv_rt_shader_group, groups, pCreateInfo->groupCount); + if (!vk_multialloc_zalloc2(&ma, 
&device->vk.alloc, pAllocator, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE)) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + result = anv_pipeline_init(&pipeline->base, device, + ANV_PIPELINE_RAY_TRACING, pCreateInfo->flags, + pAllocator); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, pipeline); + return result; + } + + pipeline->group_count = pCreateInfo->groupCount; + pipeline->groups = groups; + + ASSERTED const VkShaderStageFlags ray_tracing_stages = + VK_SHADER_STAGE_RAYGEN_BIT_KHR | + VK_SHADER_STAGE_ANY_HIT_BIT_KHR | + VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | + VK_SHADER_STAGE_MISS_BIT_KHR | + VK_SHADER_STAGE_INTERSECTION_BIT_KHR | + VK_SHADER_STAGE_CALLABLE_BIT_KHR; + + for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) + assert((pCreateInfo->pStages[i].stage & ~ray_tracing_stages) == 0); + + for (uint32_t i = 0; i < pCreateInfo->groupCount; i++) { + const VkRayTracingShaderGroupCreateInfoKHR *ginfo = + &pCreateInfo->pGroups[i]; + assert_rt_stage_index_valid(pCreateInfo, ginfo->generalShader, + VK_SHADER_STAGE_RAYGEN_BIT_KHR | + VK_SHADER_STAGE_MISS_BIT_KHR | + VK_SHADER_STAGE_CALLABLE_BIT_KHR); + assert_rt_stage_index_valid(pCreateInfo, ginfo->closestHitShader, + VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR); + assert_rt_stage_index_valid(pCreateInfo, ginfo->anyHitShader, + VK_SHADER_STAGE_ANY_HIT_BIT_KHR); + assert_rt_stage_index_valid(pCreateInfo, ginfo->intersectionShader, + VK_SHADER_STAGE_INTERSECTION_BIT_KHR); + switch (ginfo->type) { + case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: + assert(ginfo->generalShader < pCreateInfo->stageCount); + assert(ginfo->anyHitShader == VK_SHADER_UNUSED_KHR); + assert(ginfo->closestHitShader == VK_SHADER_UNUSED_KHR); + assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR); + break; + + case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: + assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR); + assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR); + break; + + case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: + assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR); + break; + + default: + unreachable("Invalid ray-tracing shader group type"); + } + } + + result = anv_ray_tracing_pipeline_init(pipeline, device, cache, + pCreateInfo, pAllocator); + if (result != VK_SUCCESS) { + anv_pipeline_finish(&pipeline->base, device, pAllocator); + vk_free2(&device->vk.alloc, pAllocator, pipeline); + return result; + } + + anv_genX(device->info, ray_tracing_pipeline_emit)(pipeline); + + *pPipeline = anv_pipeline_to_handle(&pipeline->base); + + return pipeline->base.batch.status; +} + +VkResult +anv_CreateRayTracingPipelinesKHR( + VkDevice _device, + VkDeferredOperationKHR deferredOperation, + VkPipelineCache pipelineCache, + uint32_t createInfoCount, + const VkRayTracingPipelineCreateInfoKHR* pCreateInfos, + const VkAllocationCallbacks* pAllocator, + VkPipeline* pPipelines) +{ + ANV_FROM_HANDLE(vk_pipeline_cache, pipeline_cache, pipelineCache); + + VkResult result = VK_SUCCESS; + + unsigned i; + for (i = 0; i < createInfoCount; i++) { + VkResult res = anv_ray_tracing_pipeline_create(_device, pipeline_cache, + &pCreateInfos[i], + pAllocator, &pPipelines[i]); + + if (res == VK_SUCCESS) + continue; + + /* Bail out on the first error as it is not obvious what error should be + * report upon 2 different failures. 
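+       * VK_PIPELINE_COMPILE_REQUIRED is the only exception: in that case we
+       * keep creating the remaining pipelines, unless the application also
+       * set VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT.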
*/ + result = res; + if (result != VK_PIPELINE_COMPILE_REQUIRED) + break; + + pPipelines[i] = VK_NULL_HANDLE; + + if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT) + break; + } + + for (; i < createInfoCount; i++) + pPipelines[i] = VK_NULL_HANDLE; + + return result; +} + +#define WRITE_STR(field, ...) ({ \ + memset(field, 0, sizeof(field)); \ + UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \ + assert(i > 0 && i < sizeof(field)); \ +}) + +VkResult anv_GetPipelineExecutablePropertiesKHR( + VkDevice device, + const VkPipelineInfoKHR* pPipelineInfo, + uint32_t* pExecutableCount, + VkPipelineExecutablePropertiesKHR* pProperties) +{ + ANV_FROM_HANDLE(anv_pipeline, pipeline, pPipelineInfo->pipeline); + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, + pProperties, pExecutableCount); + + util_dynarray_foreach (&pipeline->executables, struct anv_pipeline_executable, exe) { + vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) { + gl_shader_stage stage = exe->stage; + props->stages = mesa_to_vk_shader_stage(stage); + + unsigned simd_width = exe->stats.dispatch_width; + if (stage == MESA_SHADER_FRAGMENT) { + WRITE_STR(props->name, "%s%d %s", + simd_width ? "SIMD" : "vec", + simd_width ? simd_width : 4, + _mesa_shader_stage_to_string(stage)); + } else { + WRITE_STR(props->name, "%s", _mesa_shader_stage_to_string(stage)); + } + WRITE_STR(props->description, "%s%d %s shader", + simd_width ? "SIMD" : "vec", + simd_width ? simd_width : 4, + _mesa_shader_stage_to_string(stage)); + + /* The compiler gives us a dispatch width of 0 for vec4 but Vulkan + * wants a subgroup size of 1. + */ + props->subgroupSize = MAX2(simd_width, 1); + } + } + + return vk_outarray_status(&out); +} + +static const struct anv_pipeline_executable * +anv_pipeline_get_executable(struct anv_pipeline *pipeline, uint32_t index) +{ + assert(index < util_dynarray_num_elements(&pipeline->executables, + struct anv_pipeline_executable)); + return util_dynarray_element( + &pipeline->executables, struct anv_pipeline_executable, index); +} + +VkResult anv_GetPipelineExecutableStatisticsKHR( + VkDevice device, + const VkPipelineExecutableInfoKHR* pExecutableInfo, + uint32_t* pStatisticCount, + VkPipelineExecutableStatisticKHR* pStatistics) +{ + ANV_FROM_HANDLE(anv_pipeline, pipeline, pExecutableInfo->pipeline); + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, + pStatistics, pStatisticCount); + + const struct anv_pipeline_executable *exe = + anv_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); + + const struct brw_stage_prog_data *prog_data; + switch (pipeline->type) { + case ANV_PIPELINE_GRAPHICS: { + prog_data = anv_pipeline_to_graphics(pipeline)->shaders[exe->stage]->prog_data; + break; + } + case ANV_PIPELINE_COMPUTE: { + prog_data = anv_pipeline_to_compute(pipeline)->cs->prog_data; + break; + } + case ANV_PIPELINE_RAY_TRACING: { + struct anv_shader_bin **shader = + util_dynarray_element(&anv_pipeline_to_ray_tracing(pipeline)->shaders, + struct anv_shader_bin *, + pExecutableInfo->executableIndex); + prog_data = (*shader)->prog_data; + break; + } + default: + unreachable("invalid pipeline type"); + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Instruction Count"); + WRITE_STR(stat->description, + "Number of GEN instructions in the final generated " + "shader executable."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = 
exe->stats.instructions; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "SEND Count"); + WRITE_STR(stat->description, + "Number of instructions in the final generated shader " + "executable which access external units such as the " + "constant cache or the sampler."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.sends; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Loop Count"); + WRITE_STR(stat->description, + "Number of loops (not unrolled) in the final generated " + "shader executable."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.loops; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Cycle Count"); + WRITE_STR(stat->description, + "Estimate of the number of EU cycles required to execute " + "the final generated executable. This is an estimate only " + "and may vary greatly from actual run-time performance."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.cycles; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Spill Count"); + WRITE_STR(stat->description, + "Number of scratch spill operations. This gives a rough " + "estimate of the cost incurred due to spilling temporary " + "values to memory. If this is non-zero, you may want to " + "adjust your shader to reduce register pressure."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.spills; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Fill Count"); + WRITE_STR(stat->description, + "Number of scratch fill operations. This gives a rough " + "estimate of the cost incurred due to spilling temporary " + "values to memory. If this is non-zero, you may want to " + "adjust your shader to reduce register pressure."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.fills; + } + + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Scratch Memory Size"); + WRITE_STR(stat->description, + "Number of bytes of scratch memory required by the " + "generated shader executable. 
If this is non-zero, you " + "may want to adjust your shader to reduce register " + "pressure."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->total_scratch; + } + + if (gl_shader_stage_uses_workgroup(exe->stage)) { + vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) { + WRITE_STR(stat->name, "Workgroup Memory Size"); + WRITE_STR(stat->description, + "Number of bytes of workgroup shared memory used by this " + "shader including any padding."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = prog_data->total_shared; + } + } + + return vk_outarray_status(&out); +} + +static bool +write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir, + const char *data) +{ + ir->isText = VK_TRUE; + + size_t data_len = strlen(data) + 1; + + if (ir->pData == NULL) { + ir->dataSize = data_len; + return true; + } + + strncpy(ir->pData, data, ir->dataSize); + if (ir->dataSize < data_len) + return false; + + ir->dataSize = data_len; + return true; +} + +VkResult anv_GetPipelineExecutableInternalRepresentationsKHR( + VkDevice device, + const VkPipelineExecutableInfoKHR* pExecutableInfo, + uint32_t* pInternalRepresentationCount, + VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations) +{ + ANV_FROM_HANDLE(anv_pipeline, pipeline, pExecutableInfo->pipeline); + VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out, + pInternalRepresentations, pInternalRepresentationCount); + bool incomplete_text = false; + + const struct anv_pipeline_executable *exe = + anv_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex); + + if (exe->nir) { + vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { + WRITE_STR(ir->name, "Final NIR"); + WRITE_STR(ir->description, + "Final NIR before going into the back-end compiler"); + + if (!write_ir_text(ir, exe->nir)) + incomplete_text = true; + } + } + + if (exe->disasm) { + vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { + WRITE_STR(ir->name, "GEN Assembly"); + WRITE_STR(ir->description, + "Final GEN assembly for the generated shader binary"); + + if (!write_ir_text(ir, exe->disasm)) + incomplete_text = true; + } + } + + return incomplete_text ? 
VK_INCOMPLETE : vk_outarray_status(&out); +} + +VkResult +anv_GetRayTracingShaderGroupHandlesKHR( + VkDevice _device, + VkPipeline _pipeline, + uint32_t firstGroup, + uint32_t groupCount, + size_t dataSize, + void* pData) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline); + + if (pipeline->type != ANV_PIPELINE_RAY_TRACING) + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); + + struct anv_ray_tracing_pipeline *rt_pipeline = + anv_pipeline_to_ray_tracing(pipeline); + + for (uint32_t i = 0; i < groupCount; i++) { + struct anv_rt_shader_group *group = &rt_pipeline->groups[firstGroup + i]; + memcpy(pData, group->handle, sizeof(group->handle)); + pData += sizeof(group->handle); + } + + return VK_SUCCESS; +} + +VkResult +anv_GetRayTracingCaptureReplayShaderGroupHandlesKHR( + VkDevice _device, + VkPipeline pipeline, + uint32_t firstGroup, + uint32_t groupCount, + size_t dataSize, + void* pData) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + unreachable("Unimplemented"); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); +} + +VkDeviceSize +anv_GetRayTracingShaderGroupStackSizeKHR( + VkDevice device, + VkPipeline _pipeline, + uint32_t group, + VkShaderGroupShaderKHR groupShader) +{ + ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline); + assert(pipeline->type == ANV_PIPELINE_RAY_TRACING); + + struct anv_ray_tracing_pipeline *rt_pipeline = + anv_pipeline_to_ray_tracing(pipeline); + + assert(group < rt_pipeline->group_count); + + struct anv_shader_bin *bin; + switch (groupShader) { + case VK_SHADER_GROUP_SHADER_GENERAL_KHR: + bin = rt_pipeline->groups[group].general; + break; + + case VK_SHADER_GROUP_SHADER_CLOSEST_HIT_KHR: + bin = rt_pipeline->groups[group].closest_hit; + break; + + case VK_SHADER_GROUP_SHADER_ANY_HIT_KHR: + bin = rt_pipeline->groups[group].any_hit; + break; + + case VK_SHADER_GROUP_SHADER_INTERSECTION_KHR: + bin = rt_pipeline->groups[group].intersection; + break; + + default: + unreachable("Invalid VkShaderGroupShader enum"); + } + + if (bin == NULL) + return 0; + + return brw_bs_prog_data_const(bin->prog_data)->max_stack_size; +} diff --git a/src/intel/vulkan_hasvk/anv_pipeline_cache.c b/src/intel/vulkan_hasvk/anv_pipeline_cache.c new file mode 100644 index 00000000000..e85a362f7f4 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_pipeline_cache.c @@ -0,0 +1,380 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "util/blob.h" +#include "util/hash_table.h" +#include "util/debug.h" +#include "util/disk_cache.h" +#include "util/mesa-sha1.h" +#include "nir/nir_serialize.h" +#include "anv_private.h" +#include "nir/nir_xfb_info.h" +#include "vulkan/util/vk_util.h" + +static bool +anv_shader_bin_serialize(struct vk_pipeline_cache_object *object, + struct blob *blob); + +struct vk_pipeline_cache_object * +anv_shader_bin_deserialize(struct vk_device *device, + const void *key_data, size_t key_size, + struct blob_reader *blob); + +static void +anv_shader_bin_destroy(struct vk_pipeline_cache_object *object) +{ + struct anv_device *device = + container_of(object->device, struct anv_device, vk); + struct anv_shader_bin *shader = + container_of(object, struct anv_shader_bin, base); + + anv_state_pool_free(&device->instruction_state_pool, shader->kernel); + vk_pipeline_cache_object_finish(&shader->base); + vk_free(&device->vk.alloc, shader); +} + +static const struct vk_pipeline_cache_object_ops anv_shader_bin_ops = { + .serialize = anv_shader_bin_serialize, + .deserialize = anv_shader_bin_deserialize, + .destroy = anv_shader_bin_destroy, +}; + +const struct vk_pipeline_cache_object_ops *const anv_cache_import_ops[2] = { + &anv_shader_bin_ops, + NULL +}; + +struct anv_shader_bin * +anv_shader_bin_create(struct anv_device *device, + gl_shader_stage stage, + const void *key_data, uint32_t key_size, + const void *kernel_data, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data_in, + uint32_t prog_data_size, + const struct brw_compile_stats *stats, uint32_t num_stats, + const nir_xfb_info *xfb_info_in, + const struct anv_pipeline_bind_map *bind_map) +{ + VK_MULTIALLOC(ma); + VK_MULTIALLOC_DECL(&ma, struct anv_shader_bin, shader, 1); + VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, key_size); + VK_MULTIALLOC_DECL_SIZE(&ma, struct brw_stage_prog_data, prog_data, + prog_data_size); + VK_MULTIALLOC_DECL(&ma, struct brw_shader_reloc, prog_data_relocs, + prog_data_in->num_relocs); + VK_MULTIALLOC_DECL(&ma, uint32_t, prog_data_param, prog_data_in->nr_params); + + VK_MULTIALLOC_DECL_SIZE(&ma, nir_xfb_info, xfb_info, + xfb_info_in == NULL ? 
0 : + nir_xfb_info_size(xfb_info_in->output_count)); + + VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_binding, surface_to_descriptor, + bind_map->surface_count); + VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_binding, sampler_to_descriptor, + bind_map->sampler_count); + + if (!vk_multialloc_alloc(&ma, &device->vk.alloc, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE)) + return NULL; + + memcpy(obj_key_data, key_data, key_size); + vk_pipeline_cache_object_init(&device->vk, &shader->base, + &anv_shader_bin_ops, obj_key_data, key_size); + + shader->stage = stage; + + shader->kernel = + anv_state_pool_alloc(&device->instruction_state_pool, kernel_size, 64); + memcpy(shader->kernel.map, kernel_data, kernel_size); + shader->kernel_size = kernel_size; + + uint64_t shader_data_addr = INSTRUCTION_STATE_POOL_MIN_ADDRESS + + shader->kernel.offset + + prog_data_in->const_data_offset; + + int rv_count = 0; + struct brw_shader_reloc_value reloc_values[5]; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW, + .value = shader_data_addr, + }; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH, + .value = shader_data_addr >> 32, + }; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_SHADER_START_OFFSET, + .value = shader->kernel.offset, + }; + if (brw_shader_stage_is_bindless(stage)) { + const struct brw_bs_prog_data *bs_prog_data = + brw_bs_prog_data_const(prog_data_in); + uint64_t resume_sbt_addr = INSTRUCTION_STATE_POOL_MIN_ADDRESS + + shader->kernel.offset + + bs_prog_data->resume_sbt_offset; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW, + .value = resume_sbt_addr, + }; + reloc_values[rv_count++] = (struct brw_shader_reloc_value) { + .id = BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH, + .value = resume_sbt_addr >> 32, + }; + } + + brw_write_shader_relocs(&device->physical->compiler->isa, + shader->kernel.map, prog_data_in, + reloc_values, rv_count); + + memcpy(prog_data, prog_data_in, prog_data_size); + typed_memcpy(prog_data_relocs, prog_data_in->relocs, + prog_data_in->num_relocs); + prog_data->relocs = prog_data_relocs; + memset(prog_data_param, 0, + prog_data->nr_params * sizeof(*prog_data_param)); + prog_data->param = prog_data_param; + shader->prog_data = prog_data; + shader->prog_data_size = prog_data_size; + + assert(num_stats <= ARRAY_SIZE(shader->stats)); + typed_memcpy(shader->stats, stats, num_stats); + shader->num_stats = num_stats; + + if (xfb_info_in) { + *xfb_info = *xfb_info_in; + typed_memcpy(xfb_info->outputs, xfb_info_in->outputs, + xfb_info_in->output_count); + shader->xfb_info = xfb_info; + } else { + shader->xfb_info = NULL; + } + + shader->bind_map = *bind_map; + typed_memcpy(surface_to_descriptor, bind_map->surface_to_descriptor, + bind_map->surface_count); + shader->bind_map.surface_to_descriptor = surface_to_descriptor; + typed_memcpy(sampler_to_descriptor, bind_map->sampler_to_descriptor, + bind_map->sampler_count); + shader->bind_map.sampler_to_descriptor = sampler_to_descriptor; + + return shader; +} + +static bool +anv_shader_bin_serialize(struct vk_pipeline_cache_object *object, + struct blob *blob) +{ + struct anv_shader_bin *shader = + container_of(object, struct anv_shader_bin, base); + + blob_write_uint32(blob, shader->stage); + + blob_write_uint32(blob, shader->kernel_size); + blob_write_bytes(blob, shader->kernel.map, shader->kernel_size); + + blob_write_uint32(blob, 
shader->prog_data_size); + blob_write_bytes(blob, shader->prog_data, shader->prog_data_size); + blob_write_bytes(blob, shader->prog_data->relocs, + shader->prog_data->num_relocs * + sizeof(shader->prog_data->relocs[0])); + + blob_write_uint32(blob, shader->num_stats); + blob_write_bytes(blob, shader->stats, + shader->num_stats * sizeof(shader->stats[0])); + + if (shader->xfb_info) { + uint32_t xfb_info_size = + nir_xfb_info_size(shader->xfb_info->output_count); + blob_write_uint32(blob, xfb_info_size); + blob_write_bytes(blob, shader->xfb_info, xfb_info_size); + } else { + blob_write_uint32(blob, 0); + } + + blob_write_bytes(blob, shader->bind_map.surface_sha1, + sizeof(shader->bind_map.surface_sha1)); + blob_write_bytes(blob, shader->bind_map.sampler_sha1, + sizeof(shader->bind_map.sampler_sha1)); + blob_write_bytes(blob, shader->bind_map.push_sha1, + sizeof(shader->bind_map.push_sha1)); + blob_write_uint32(blob, shader->bind_map.surface_count); + blob_write_uint32(blob, shader->bind_map.sampler_count); + blob_write_bytes(blob, shader->bind_map.surface_to_descriptor, + shader->bind_map.surface_count * + sizeof(*shader->bind_map.surface_to_descriptor)); + blob_write_bytes(blob, shader->bind_map.sampler_to_descriptor, + shader->bind_map.sampler_count * + sizeof(*shader->bind_map.sampler_to_descriptor)); + blob_write_bytes(blob, shader->bind_map.push_ranges, + sizeof(shader->bind_map.push_ranges)); + + return !blob->out_of_memory; +} + +struct vk_pipeline_cache_object * +anv_shader_bin_deserialize(struct vk_device *vk_device, + const void *key_data, size_t key_size, + struct blob_reader *blob) +{ + struct anv_device *device = + container_of(vk_device, struct anv_device, vk); + + gl_shader_stage stage = blob_read_uint32(blob); + + uint32_t kernel_size = blob_read_uint32(blob); + const void *kernel_data = blob_read_bytes(blob, kernel_size); + + uint32_t prog_data_size = blob_read_uint32(blob); + const void *prog_data_bytes = blob_read_bytes(blob, prog_data_size); + if (blob->overrun) + return NULL; + + union brw_any_prog_data prog_data; + memcpy(&prog_data, prog_data_bytes, + MIN2(sizeof(prog_data), prog_data_size)); + prog_data.base.relocs = + blob_read_bytes(blob, prog_data.base.num_relocs * + sizeof(prog_data.base.relocs[0])); + + uint32_t num_stats = blob_read_uint32(blob); + const struct brw_compile_stats *stats = + blob_read_bytes(blob, num_stats * sizeof(stats[0])); + + const nir_xfb_info *xfb_info = NULL; + uint32_t xfb_size = blob_read_uint32(blob); + if (xfb_size) + xfb_info = blob_read_bytes(blob, xfb_size); + + struct anv_pipeline_bind_map bind_map; + blob_copy_bytes(blob, bind_map.surface_sha1, sizeof(bind_map.surface_sha1)); + blob_copy_bytes(blob, bind_map.sampler_sha1, sizeof(bind_map.sampler_sha1)); + blob_copy_bytes(blob, bind_map.push_sha1, sizeof(bind_map.push_sha1)); + bind_map.surface_count = blob_read_uint32(blob); + bind_map.sampler_count = blob_read_uint32(blob); + bind_map.surface_to_descriptor = (void *) + blob_read_bytes(blob, bind_map.surface_count * + sizeof(*bind_map.surface_to_descriptor)); + bind_map.sampler_to_descriptor = (void *) + blob_read_bytes(blob, bind_map.sampler_count * + sizeof(*bind_map.sampler_to_descriptor)); + blob_copy_bytes(blob, bind_map.push_ranges, sizeof(bind_map.push_ranges)); + + if (blob->overrun) + return NULL; + + struct anv_shader_bin *shader = + anv_shader_bin_create(device, stage, + key_data, key_size, + kernel_data, kernel_size, + &prog_data.base, prog_data_size, + stats, num_stats, xfb_info, &bind_map); + if (shader == NULL) + 
return NULL; + + return &shader->base; +} + +struct anv_shader_bin * +anv_device_search_for_kernel(struct anv_device *device, + struct vk_pipeline_cache *cache, + const void *key_data, uint32_t key_size, + bool *user_cache_hit) +{ + /* Use the default pipeline cache if none is specified */ + if (cache == NULL) + cache = device->default_pipeline_cache; + + bool cache_hit = false; + struct vk_pipeline_cache_object *object = + vk_pipeline_cache_lookup_object(cache, key_data, key_size, + &anv_shader_bin_ops, &cache_hit); + if (user_cache_hit != NULL) { + *user_cache_hit = object != NULL && cache_hit && + cache != device->default_pipeline_cache; + } + if (object == NULL) + return NULL; + + return container_of(object, struct anv_shader_bin, base); +} + +struct anv_shader_bin * +anv_device_upload_kernel(struct anv_device *device, + struct vk_pipeline_cache *cache, + gl_shader_stage stage, + const void *key_data, uint32_t key_size, + const void *kernel_data, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, + const struct brw_compile_stats *stats, + uint32_t num_stats, + const nir_xfb_info *xfb_info, + const struct anv_pipeline_bind_map *bind_map) +{ + /* Use the default pipeline cache if none is specified */ + if (cache == NULL) + cache = device->default_pipeline_cache; + + struct anv_shader_bin *shader = + anv_shader_bin_create(device, stage, + key_data, key_size, + kernel_data, kernel_size, + prog_data, prog_data_size, + stats, num_stats, + xfb_info, bind_map); + if (shader == NULL) + return NULL; + + struct vk_pipeline_cache_object *cached = + vk_pipeline_cache_add_object(cache, &shader->base); + + return container_of(cached, struct anv_shader_bin, base); +} + +#define SHA1_KEY_SIZE 20 + +struct nir_shader * +anv_device_search_for_nir(struct anv_device *device, + struct vk_pipeline_cache *cache, + const nir_shader_compiler_options *nir_options, + unsigned char sha1_key[SHA1_KEY_SIZE], + void *mem_ctx) +{ + if (cache == NULL) + cache = device->default_pipeline_cache; + + return vk_pipeline_cache_lookup_nir(cache, sha1_key, SHA1_KEY_SIZE, + nir_options, NULL, mem_ctx); +} + +void +anv_device_upload_nir(struct anv_device *device, + struct vk_pipeline_cache *cache, + const struct nir_shader *nir, + unsigned char sha1_key[SHA1_KEY_SIZE]) +{ + if (cache == NULL) + cache = device->default_pipeline_cache; + + vk_pipeline_cache_add_nir(cache, sha1_key, SHA1_KEY_SIZE, nir); +} diff --git a/src/intel/vulkan_hasvk/anv_private.h b/src/intel/vulkan_hasvk/anv_private.h new file mode 100644 index 00000000000..c00c4565142 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_private.h @@ -0,0 +1,4303 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef ANV_PRIVATE_H +#define ANV_PRIVATE_H + +#include +#include +#include +#include +#include +#include +#include "drm-uapi/i915_drm.h" +#include "drm-uapi/drm_fourcc.h" + +#ifdef HAVE_VALGRIND +#include +#include +#define VG(x) x +#else +#define VG(x) ((void)0) +#endif + +#include "common/intel_clflush.h" +#include "common/intel_decoder.h" +#include "common/intel_gem.h" +#include "common/intel_l3_config.h" +#include "common/intel_measure.h" +#include "common/intel_sample_positions.h" +#include "dev/intel_device_info.h" +#include "blorp/blorp.h" +#include "compiler/brw_compiler.h" +#include "compiler/brw_rt.h" +#include "ds/intel_driver_ds.h" +#include "util/bitset.h" +#include "util/bitscan.h" +#include "util/macros.h" +#include "util/hash_table.h" +#include "util/list.h" +#include "util/perf/u_trace.h" +#include "util/sparse_array.h" +#include "util/u_atomic.h" +#include "util/u_vector.h" +#include "util/u_math.h" +#include "util/vma.h" +#include "util/xmlconfig.h" +#include "vk_alloc.h" +#include "vk_buffer.h" +#include "vk_command_buffer.h" +#include "vk_command_pool.h" +#include "vk_debug_report.h" +#include "vk_device.h" +#include "vk_drm_syncobj.h" +#include "vk_enum_defines.h" +#include "vk_framebuffer.h" +#include "vk_graphics_state.h" +#include "vk_image.h" +#include "vk_instance.h" +#include "vk_pipeline_cache.h" +#include "vk_physical_device.h" +#include "vk_shader_module.h" +#include "vk_sync.h" +#include "vk_sync_timeline.h" +#include "vk_util.h" +#include "vk_queue.h" +#include "vk_log.h" + +/* Pre-declarations needed for WSI entrypoints */ +struct wl_surface; +struct wl_display; +typedef struct xcb_connection_t xcb_connection_t; +typedef uint32_t xcb_visualid_t; +typedef uint32_t xcb_window_t; + +struct anv_batch; +struct anv_buffer; +struct anv_buffer_view; +struct anv_image_view; +struct anv_acceleration_structure; +struct anv_instance; + +struct intel_aux_map_context; +struct intel_perf_config; +struct intel_perf_counter_pass; +struct intel_perf_query_result; + +#include +#include + +#include "anv_android.h" +#include "anv_entrypoints.h" +#include "isl/isl.h" + +#include "dev/intel_debug.h" +#undef MESA_LOG_TAG +#define MESA_LOG_TAG "MESA-INTEL" +#include "util/log.h" +#include "wsi_common.h" + +#define NSEC_PER_SEC 1000000000ull + +/* anv Virtual Memory Layout + * ========================= + * + * When the anv driver is determining the virtual graphics addresses of memory + * objects itself using the softpin mechanism, the following memory ranges + * will be used. + * + * Three special considerations to notice: + * + * (1) the dynamic state pool is located within the same 4 GiB as the low + * heap. This is to work around a VF cache issue described in a comment in + * anv_physical_device_init_heaps. + * + * (2) the binding table pool is located at lower addresses than the surface + * state pool, within a 4 GiB range. This allows surface state base addresses + * to cover both binding tables (16 bit offsets) and surface states (32 bit + * offsets). 
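+ * In the layout below this is the 4 GiB to 6 GiB range covered by the
+ * BINDING_TABLE_POOL_* and SURFACE_STATE_POOL_* defines.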
+ * + * (3) the last 4 GiB of the address space is withheld from the high + * heap. Various hardware units will read past the end of an object for + * various reasons. This healthy margin prevents reads from wrapping around + * 48-bit addresses. + */ +#define GENERAL_STATE_POOL_MIN_ADDRESS 0x000000200000ULL /* 2 MiB */ +#define GENERAL_STATE_POOL_MAX_ADDRESS 0x00003fffffffULL +#define LOW_HEAP_MIN_ADDRESS 0x000040000000ULL /* 1 GiB */ +#define LOW_HEAP_MAX_ADDRESS 0x00007fffffffULL +#define DYNAMIC_STATE_POOL_MIN_ADDRESS 0x0000c0000000ULL /* 3 GiB */ +#define DYNAMIC_STATE_POOL_MAX_ADDRESS 0x0000ffffffffULL +#define BINDING_TABLE_POOL_MIN_ADDRESS 0x000100000000ULL /* 4 GiB */ +#define BINDING_TABLE_POOL_MAX_ADDRESS 0x00013fffffffULL +#define SURFACE_STATE_POOL_MIN_ADDRESS 0x000140000000ULL /* 5 GiB */ +#define SURFACE_STATE_POOL_MAX_ADDRESS 0x00017fffffffULL +#define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000180000000ULL /* 6 GiB */ +#define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL +#define CLIENT_VISIBLE_HEAP_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB */ +#define CLIENT_VISIBLE_HEAP_MAX_ADDRESS 0x0002bfffffffULL +#define HIGH_HEAP_MIN_ADDRESS 0x0002c0000000ULL /* 11 GiB */ + +#define GENERAL_STATE_POOL_SIZE \ + (GENERAL_STATE_POOL_MAX_ADDRESS - GENERAL_STATE_POOL_MIN_ADDRESS + 1) +#define LOW_HEAP_SIZE \ + (LOW_HEAP_MAX_ADDRESS - LOW_HEAP_MIN_ADDRESS + 1) +#define DYNAMIC_STATE_POOL_SIZE \ + (DYNAMIC_STATE_POOL_MAX_ADDRESS - DYNAMIC_STATE_POOL_MIN_ADDRESS + 1) +#define BINDING_TABLE_POOL_SIZE \ + (BINDING_TABLE_POOL_MAX_ADDRESS - BINDING_TABLE_POOL_MIN_ADDRESS + 1) +#define BINDING_TABLE_POOL_BLOCK_SIZE (65536) +#define SURFACE_STATE_POOL_SIZE \ + (SURFACE_STATE_POOL_MAX_ADDRESS - SURFACE_STATE_POOL_MIN_ADDRESS + 1) +#define INSTRUCTION_STATE_POOL_SIZE \ + (INSTRUCTION_STATE_POOL_MAX_ADDRESS - INSTRUCTION_STATE_POOL_MIN_ADDRESS + 1) +#define CLIENT_VISIBLE_HEAP_SIZE \ + (CLIENT_VISIBLE_HEAP_MAX_ADDRESS - CLIENT_VISIBLE_HEAP_MIN_ADDRESS + 1) + +/* Allowing different clear colors requires us to perform a depth resolve at + * the end of certain render passes. This is because while slow clears store + * the clear color in the HiZ buffer, fast clears (without a resolve) don't. + * See the PRMs for examples describing when additional resolves would be + * necessary. To enable fast clears without requiring extra resolves, we set + * the clear value to a globally-defined one. We could allow different values + * if the user doesn't expect coherent data during or after a render passes + * (VK_ATTACHMENT_STORE_OP_DONT_CARE), but such users (aside from the CTS) + * don't seem to exist yet. In almost all Vulkan applications tested thus far, + * 1.0f seems to be the only value used. The only application that doesn't set + * this value does so through the usage of an seemingly uninitialized clear + * value. + */ +#define ANV_HZ_FC_VAL 1.0f + +/* 3DSTATE_VERTEX_BUFFER supports 33 VBs, we use 2 for base & drawid SGVs */ +#define MAX_VBS (33 - 2) + +/* 3DSTATE_VERTEX_ELEMENTS supports up to 34 VEs, but our backend compiler + * only supports the push model of VS inputs, and we only have 128 GRFs, + * minus the g0 and g1 payload, which gives us a maximum of 31 VEs. Plus, + * we use two of them for SGVs. 
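+ * That leaves MAX_VES = 31 - 2 = 29 vertex elements for the application.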
+ */ +#define MAX_VES (31 - 2) + +#define MAX_XFB_BUFFERS 4 +#define MAX_XFB_STREAMS 4 +#define MAX_SETS 32 +#define MAX_RTS 8 +#define MAX_VIEWPORTS 16 +#define MAX_SCISSORS 16 +#define MAX_PUSH_CONSTANTS_SIZE 128 +#define MAX_DYNAMIC_BUFFERS 16 +#define MAX_IMAGES 64 +#define MAX_PUSH_DESCRIPTORS 32 /* Minimum requirement */ +#define MAX_INLINE_UNIFORM_BLOCK_SIZE 4096 +#define MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS 32 +/* We need 16 for UBO block reads to work and 32 for push UBOs. However, we + * use 64 here to avoid cache issues. This could most likely bring it back to + * 32 if we had different virtual addresses for the different views on a given + * GEM object. + */ +#define ANV_UBO_ALIGNMENT 64 +#define ANV_SSBO_ALIGNMENT 4 +#define ANV_SSBO_BOUNDS_CHECK_ALIGNMENT 4 +#define MAX_VIEWS_FOR_PRIMITIVE_REPLICATION 16 +#define MAX_SAMPLE_LOCATIONS 16 + +/* From the Skylake PRM Vol. 7 "Binding Table Surface State Model": + * + * "The surface state model is used when a Binding Table Index (specified + * in the message descriptor) of less than 240 is specified. In this model, + * the Binding Table Index is used to index into the binding table, and the + * binding table entry contains a pointer to the SURFACE_STATE." + * + * Binding table values above 240 are used for various things in the hardware + * such as stateless, stateless with incoherent cache, SLM, and bindless. + */ +#define MAX_BINDING_TABLE_SIZE 240 + +/* The kernel relocation API has a limitation of a 32-bit delta value + * applied to the address before it is written which, in spite of it being + * unsigned, is treated as signed . Because of the way that this maps to + * the Vulkan API, we cannot handle an offset into a buffer that does not + * fit into a signed 32 bits. The only mechanism we have for dealing with + * this at the moment is to limit all VkDeviceMemory objects to a maximum + * of 2GB each. The Vulkan spec allows us to do this: + * + * "Some platforms may have a limit on the maximum size of a single + * allocation. For example, certain systems may fail to create + * allocations with a size greater than or equal to 4GB. Such a limit is + * implementation-dependent, and if such a failure occurs then the error + * VK_ERROR_OUT_OF_DEVICE_MEMORY should be returned." + */ +#define MAX_MEMORY_ALLOCATION_SIZE (1ull << 31) + +#define ANV_SVGS_VB_INDEX MAX_VBS +#define ANV_DRAWID_VB_INDEX (MAX_VBS + 1) + +/* We reserve this MI ALU register for the purpose of handling predication. + * Other code which uses the MI ALU should leave it alone. + */ +#define ANV_PREDICATE_RESULT_REG 0x2678 /* MI_ALU_REG15 */ + +/* We reserve this MI ALU register to pass around an offset computed from + * VkPerformanceQuerySubmitInfoKHR::counterPassIndex VK_KHR_performance_query. + * Other code which uses the MI ALU should leave it alone. + */ +#define ANV_PERF_QUERY_OFFSET_REG 0x2670 /* MI_ALU_REG14 */ + +#define ANV_GRAPHICS_SHADER_STAGE_COUNT (MESA_SHADER_MESH + 1) + +/* For gfx12 we set the streamout buffers using 4 separate commands + * (3DSTATE_SO_BUFFER_INDEX_*) instead of 3DSTATE_SO_BUFFER. However the layout + * of the 3DSTATE_SO_BUFFER_INDEX_* commands is identical to that of + * 3DSTATE_SO_BUFFER apart from the SOBufferIndex field, so for now we use the + * 3DSTATE_SO_BUFFER command, but change the 3DCommandSubOpcode. + * SO_BUFFER_INDEX_0_CMD is actually the 3DCommandSubOpcode for + * 3DSTATE_SO_BUFFER_INDEX_0. 
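+ * (Hence the 0x60 value defined just below.)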
+ */ +#define SO_BUFFER_INDEX_0_CMD 0x60 +#define anv_printflike(a, b) __attribute__((__format__(__printf__, a, b))) + +static inline uint32_t +align_down_npot_u32(uint32_t v, uint32_t a) +{ + return v - (v % a); +} + +static inline uint32_t +align_down_u32(uint32_t v, uint32_t a) +{ + assert(a != 0 && a == (a & -a)); + return v & ~(a - 1); +} + +static inline uint32_t +align_u32(uint32_t v, uint32_t a) +{ + assert(a != 0 && a == (a & -a)); + return align_down_u32(v + a - 1, a); +} + +static inline uint64_t +align_down_u64(uint64_t v, uint64_t a) +{ + assert(a != 0 && a == (a & -a)); + return v & ~(a - 1); +} + +static inline uint64_t +align_u64(uint64_t v, uint64_t a) +{ + return align_down_u64(v + a - 1, a); +} + +static inline int32_t +align_i32(int32_t v, int32_t a) +{ + assert(a != 0 && a == (a & -a)); + return (v + a - 1) & ~(a - 1); +} + +/** Alignment must be a power of 2. */ +static inline bool +anv_is_aligned(uintmax_t n, uintmax_t a) +{ + assert(a == (a & -a)); + return (n & (a - 1)) == 0; +} + +static inline uint32_t +anv_minify(uint32_t n, uint32_t levels) +{ + if (unlikely(n == 0)) + return 0; + else + return MAX2(n >> levels, 1); +} + +static inline float +anv_clamp_f(float f, float min, float max) +{ + assert(min < max); + + if (f > max) + return max; + else if (f < min) + return min; + else + return f; +} + +static inline bool +anv_clear_mask(uint32_t *inout_mask, uint32_t clear_mask) +{ + if (*inout_mask & clear_mask) { + *inout_mask &= ~clear_mask; + return true; + } else { + return false; + } +} + +static inline union isl_color_value +vk_to_isl_color(VkClearColorValue color) +{ + return (union isl_color_value) { + .u32 = { + color.uint32[0], + color.uint32[1], + color.uint32[2], + color.uint32[3], + }, + }; +} + +static inline union isl_color_value +vk_to_isl_color_with_format(VkClearColorValue color, enum isl_format format) +{ + const struct isl_format_layout *fmtl = isl_format_get_layout(format); + union isl_color_value isl_color = { .u32 = {0, } }; + +#define COPY_COLOR_CHANNEL(c, i) \ + if (fmtl->channels.c.bits) \ + isl_color.u32[i] = color.uint32[i] + + COPY_COLOR_CHANNEL(r, 0); + COPY_COLOR_CHANNEL(g, 1); + COPY_COLOR_CHANNEL(b, 2); + COPY_COLOR_CHANNEL(a, 3); + +#undef COPY_COLOR_CHANNEL + + return isl_color; +} + +static inline void *anv_unpack_ptr(uintptr_t ptr, int bits, int *flags) +{ + uintptr_t mask = (1ull << bits) - 1; + *flags = ptr & mask; + return (void *) (ptr & ~mask); +} + +static inline uintptr_t anv_pack_ptr(void *ptr, int bits, int flags) +{ + uintptr_t value = (uintptr_t) ptr; + uintptr_t mask = (1ull << bits) - 1; + return value | (mask & flags); +} + +/** + * Warn on ignored extension structs. + * + * The Vulkan spec requires us to ignore unsupported or unknown structs in + * a pNext chain. In debug mode, emitting warnings for ignored structs may + * help us discover structs that we should not have ignored. + * + * + * From the Vulkan 1.0.38 spec: + * + * Any component of the implementation (the loader, any enabled layers, + * and drivers) must skip over, without processing (other than reading the + * sType and pNext members) any chained structures with sType values not + * defined by extensions supported by that component. + */ +#define anv_debug_ignored_stype(sType) \ + mesa_logd("%s: ignored VkStructureType %u\n", __func__, (sType)) + +void __anv_perf_warn(struct anv_device *device, + const struct vk_object_base *object, + const char *file, int line, const char *format, ...) 
+ anv_printflike(5, 6); + +/** + * Print a FINISHME message, including its source location. + */ +#define anv_finishme(format, ...) \ + do { \ + static bool reported = false; \ + if (!reported) { \ + mesa_logw("%s:%d: FINISHME: " format, __FILE__, __LINE__, \ + ##__VA_ARGS__); \ + reported = true; \ + } \ + } while (0) + +/** + * Print a perf warning message. Set INTEL_DEBUG=perf to see these. + */ +#define anv_perf_warn(objects_macro, format, ...) \ + do { \ + static bool reported = false; \ + if (!reported && INTEL_DEBUG(DEBUG_PERF)) { \ + __vk_log(VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT, \ + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, \ + objects_macro, __FILE__, __LINE__, \ + format, ## __VA_ARGS__); \ + reported = true; \ + } \ + } while (0) + +/* A non-fatal assert. Useful for debugging. */ +#ifdef DEBUG +#define anv_assert(x) ({ \ + if (unlikely(!(x))) \ + mesa_loge("%s:%d ASSERT: %s", __FILE__, __LINE__, #x); \ +}) +#else +#define anv_assert(x) +#endif + +struct anv_bo { + const char *name; + + uint32_t gem_handle; + + uint32_t refcount; + + /* Index into the current validation list. This is used by the + * validation list building algorithm to track which buffers are already + * in the validation list so that we can ensure uniqueness. + */ + uint32_t exec_obj_index; + + /* Index for use with util_sparse_array_free_list */ + uint32_t free_index; + + /* Last known offset. This value is provided by the kernel when we + * execbuf and is used as the presumed offset for the next bunch of + * relocations. + */ + uint64_t offset; + + /** Size of the buffer not including implicit aux */ + uint64_t size; + + /* Map for internally mapped BOs. + * + * If ANV_BO_ALLOC_MAPPED is set in flags, this is the map for the whole + * BO. If ANV_BO_WRAPPER is set in flags, map points to the wrapped BO. + */ + void *map; + + /** Size of the implicit CCS range at the end of the buffer + * + * On Gfx12, CCS data is always a direct 1/256 scale-down. A single 64K + * page of main surface data maps to a 256B chunk of CCS data and that + * mapping is provided on TGL-LP by the AUX table which maps virtual memory + * addresses in the main surface to virtual memory addresses for CCS data. + * + * Because we can't change these maps around easily and because Vulkan + * allows two VkImages to be bound to overlapping memory regions (as long + * as the app is careful), it's not feasible to make this mapping part of + * the image. (On Gfx11 and earlier, the mapping was provided via + * RENDER_SURFACE_STATE so each image had its own main -> CCS mapping.) + * Instead, we attach the CCS data directly to the buffer object and setup + * the AUX table mapping at BO creation time. + * + * This field is for internal tracking use by the BO allocator only and + * should not be touched by other parts of the code. If something wants to + * know if a BO has implicit CCS data, it should instead look at the + * has_implicit_ccs boolean below. + * + * This data is not included in maps of this buffer. + */ + uint32_t _ccs_size; + + /** Flags to pass to the kernel through drm_i915_exec_object2::flags */ + uint32_t flags; + + /** True if this BO may be shared with other processes */ + bool is_external:1; + + /** True if this BO is a wrapper + * + * When set to true, none of the fields in this BO are meaningful except + * for anv_bo::is_wrapper and anv_bo::map which points to the actual BO. + * See also anv_bo_unwrap(). Wrapper BOs are not allowed when use_softpin + * is set in the physical device. 
+ */ + bool is_wrapper:1; + + /** See also ANV_BO_ALLOC_FIXED_ADDRESS */ + bool has_fixed_address:1; + + /** True if this BO wraps a host pointer */ + bool from_host_ptr:1; + + /** See also ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS */ + bool has_client_visible_address:1; + + /** True if this BO has implicit CCS data attached to it */ + bool has_implicit_ccs:1; +}; + +static inline struct anv_bo * +anv_bo_ref(struct anv_bo *bo) +{ + p_atomic_inc(&bo->refcount); + return bo; +} + +static inline struct anv_bo * +anv_bo_unwrap(struct anv_bo *bo) +{ + while (bo->is_wrapper) + bo = bo->map; + return bo; +} + +static inline bool +anv_bo_is_pinned(struct anv_bo *bo) +{ +#if defined(GFX_VERx10) && GFX_VERx10 >= 90 + /* Sky Lake and later always uses softpin */ + assert(bo->flags & EXEC_OBJECT_PINNED); + return true; +#elif defined(GFX_VERx10) && GFX_VERx10 < 80 + /* Haswell and earlier never use softpin */ + assert(!(bo->flags & EXEC_OBJECT_PINNED)); + assert(!bo->has_fixed_address); + return false; +#else + /* If we don't have a GFX_VERx10 #define, we need to look at the BO. Also, + * for GFX version 8, we need to look at the BO because Broadwell softpins + * but Cherryview doesn't. + */ + assert((bo->flags & EXEC_OBJECT_PINNED) || !bo->has_fixed_address); + return (bo->flags & EXEC_OBJECT_PINNED) != 0; +#endif +} + +struct anv_address { + struct anv_bo *bo; + int64_t offset; +}; + +#define ANV_NULL_ADDRESS ((struct anv_address) { NULL, 0 }) + +static inline struct anv_address +anv_address_from_u64(uint64_t addr_u64) +{ + assert(addr_u64 == intel_canonical_address(addr_u64)); + return (struct anv_address) { + .bo = NULL, + .offset = addr_u64, + }; +} + +static inline bool +anv_address_is_null(struct anv_address addr) +{ + return addr.bo == NULL && addr.offset == 0; +} + +static inline uint64_t +anv_address_physical(struct anv_address addr) +{ + if (addr.bo && anv_bo_is_pinned(addr.bo)) { + return intel_canonical_address(addr.bo->offset + addr.offset); + } else { + return intel_canonical_address(addr.offset); + } +} + +static inline struct anv_address +anv_address_add(struct anv_address addr, uint64_t offset) +{ + addr.offset += offset; + return addr; +} + +/* Represents a lock-free linked list of "free" things. This is used by + * both the block pool and the state pools. Unfortunately, in order to + * solve the ABA problem, we can't use a single uint32_t head. + */ +union anv_free_list { + struct { + uint32_t offset; + + /* A simple count that is incremented every time the head changes. */ + uint32_t count; + }; + /* Make sure it's aligned to 64 bits. This will make atomic operations + * faster on 32 bit platforms. + */ + uint64_t u64 __attribute__ ((aligned (8))); +}; + +#define ANV_FREE_LIST_EMPTY ((union anv_free_list) { { UINT32_MAX, 0 } }) + +struct anv_block_state { + union { + struct { + uint32_t next; + uint32_t end; + }; + /* Make sure it's aligned to 64 bits. This will make atomic operations + * faster on 32 bit platforms. + */ + uint64_t u64 __attribute__ ((aligned (8))); + }; +}; + +#define anv_block_pool_foreach_bo(bo, pool) \ + for (struct anv_bo **_pp_bo = (pool)->bos, *bo; \ + _pp_bo != &(pool)->bos[(pool)->nbos] && (bo = *_pp_bo, true); \ + _pp_bo++) + +#define ANV_MAX_BLOCK_POOL_BOS 20 + +struct anv_block_pool { + const char *name; + + struct anv_device *device; + bool use_relocations; + + /* Wrapper BO for use in relocation lists. This BO is simply a wrapper + * around the actual BO so that we grow the pool after the wrapper BO has + * been put in a relocation list. 
This is only used in the non-softpin + * case. + */ + struct anv_bo wrapper_bo; + + struct anv_bo *bos[ANV_MAX_BLOCK_POOL_BOS]; + struct anv_bo *bo; + uint32_t nbos; + + uint64_t size; + + /* The address where the start of the pool is pinned. The various bos that + * are created as the pool grows will have addresses in the range + * [start_address, start_address + BLOCK_POOL_MEMFD_SIZE). + */ + uint64_t start_address; + + /* The offset from the start of the bo to the "center" of the block + * pool. Pointers to allocated blocks are given by + * bo.map + center_bo_offset + offsets. + */ + uint32_t center_bo_offset; + + /* Current memory map of the block pool. This pointer may or may not + * point to the actual beginning of the block pool memory. If + * anv_block_pool_alloc_back has ever been called, then this pointer + * will point to the "center" position of the buffer and all offsets + * (negative or positive) given out by the block pool alloc functions + * will be valid relative to this pointer. + * + * In particular, map == bo.map + center_offset + * + * DO NOT access this pointer directly. Use anv_block_pool_map() instead, + * since it will handle the softpin case as well, where this points to NULL. + */ + void *map; + int fd; + + /** + * Array of mmaps and gem handles owned by the block pool, reclaimed when + * the block pool is destroyed. + */ + struct u_vector mmap_cleanups; + + struct anv_block_state state; + + struct anv_block_state back_state; +}; + +/* Block pools are backed by a fixed-size 1GB memfd */ +#define BLOCK_POOL_MEMFD_SIZE (1ul << 30) + +/* The center of the block pool is also the middle of the memfd. This may + * change in the future if we decide differently for some reason. + */ +#define BLOCK_POOL_MEMFD_CENTER (BLOCK_POOL_MEMFD_SIZE / 2) + +static inline uint32_t +anv_block_pool_size(struct anv_block_pool *pool) +{ + return pool->state.end + pool->back_state.end; +} + +struct anv_state { + int32_t offset; + uint32_t alloc_size; + void *map; + uint32_t idx; +}; + +#define ANV_STATE_NULL ((struct anv_state) { .alloc_size = 0 }) + +struct anv_fixed_size_state_pool { + union anv_free_list free_list; + struct anv_block_state block; +}; + +#define ANV_MIN_STATE_SIZE_LOG2 6 +#define ANV_MAX_STATE_SIZE_LOG2 21 + +#define ANV_STATE_BUCKETS (ANV_MAX_STATE_SIZE_LOG2 - ANV_MIN_STATE_SIZE_LOG2 + 1) + +struct anv_free_entry { + uint32_t next; + struct anv_state state; +}; + +struct anv_state_table { + struct anv_device *device; + int fd; + struct anv_free_entry *map; + uint32_t size; + struct anv_block_state state; + struct u_vector cleanups; +}; + +struct anv_state_pool { + struct anv_block_pool block_pool; + + /* Offset into the relevant state base address where the state pool starts + * allocating memory. 
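+    *
+    * For example (rough sketch, hypothetical caller, not part of this
+    * patch):
+    *
+    *    struct anv_state s = anv_state_pool_alloc(pool, 64, 64);
+    *    memcpy(s.map, data, 64);
+    *    anv_state_pool_free(pool, s);
+    *
+    * where s.map is the CPU pointer and the GPU address is presumably the
+    * pool's base_address plus s.offset, with offsets handed out starting at
+    * start_offset.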
+ */ + int32_t start_offset; + + struct anv_state_table table; + + /* The size of blocks which will be allocated from the block pool */ + uint32_t block_size; + + /** Free list for "back" allocations */ + union anv_free_list back_alloc_free_list; + + struct anv_fixed_size_state_pool buckets[ANV_STATE_BUCKETS]; +}; + +struct anv_state_reserved_pool { + struct anv_state_pool *pool; + union anv_free_list reserved_blocks; + uint32_t count; +}; + +struct anv_state_stream { + struct anv_state_pool *state_pool; + + /* The size of blocks to allocate from the state pool */ + uint32_t block_size; + + /* Current block we're allocating from */ + struct anv_state block; + + /* Offset into the current block at which to allocate the next state */ + uint32_t next; + + /* List of all blocks allocated from this pool */ + struct util_dynarray all_blocks; +}; + +/* The block_pool functions exported for testing only. The block pool should + * only be used via a state pool (see below). + */ +VkResult anv_block_pool_init(struct anv_block_pool *pool, + struct anv_device *device, + const char *name, + uint64_t start_address, + uint32_t initial_size); +void anv_block_pool_finish(struct anv_block_pool *pool); +int32_t anv_block_pool_alloc(struct anv_block_pool *pool, + uint32_t block_size, uint32_t *padding); +int32_t anv_block_pool_alloc_back(struct anv_block_pool *pool, + uint32_t block_size); +void* anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t +size); + +VkResult anv_state_pool_init(struct anv_state_pool *pool, + struct anv_device *device, + const char *name, + uint64_t base_address, + int32_t start_offset, + uint32_t block_size); +void anv_state_pool_finish(struct anv_state_pool *pool); +struct anv_state anv_state_pool_alloc(struct anv_state_pool *pool, + uint32_t state_size, uint32_t alignment); +struct anv_state anv_state_pool_alloc_back(struct anv_state_pool *pool); +void anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state); +void anv_state_stream_init(struct anv_state_stream *stream, + struct anv_state_pool *state_pool, + uint32_t block_size); +void anv_state_stream_finish(struct anv_state_stream *stream); +struct anv_state anv_state_stream_alloc(struct anv_state_stream *stream, + uint32_t size, uint32_t alignment); + +void anv_state_reserved_pool_init(struct anv_state_reserved_pool *pool, + struct anv_state_pool *parent, + uint32_t count, uint32_t size, + uint32_t alignment); +void anv_state_reserved_pool_finish(struct anv_state_reserved_pool *pool); +struct anv_state anv_state_reserved_pool_alloc(struct anv_state_reserved_pool *pool); +void anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool, + struct anv_state state); + +VkResult anv_state_table_init(struct anv_state_table *table, + struct anv_device *device, + uint32_t initial_entries); +void anv_state_table_finish(struct anv_state_table *table); +VkResult anv_state_table_add(struct anv_state_table *table, uint32_t *idx, + uint32_t count); +void anv_free_list_push(union anv_free_list *list, + struct anv_state_table *table, + uint32_t idx, uint32_t count); +struct anv_state* anv_free_list_pop(union anv_free_list *list, + struct anv_state_table *table); + + +static inline struct anv_state * +anv_state_table_get(struct anv_state_table *table, uint32_t idx) +{ + return &table->map[idx].state; +} +/** + * Implements a pool of re-usable BOs. The interface is identical to that + * of block_pool except that each block is its own BO. 
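+ *
+ * A rough usage sketch (hypothetical caller, not part of this patch; the
+ * real pools such as anv_device::batch_bo_pool are set up by the device
+ * code):
+ *
+ *    struct anv_bo *bo;
+ *    VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool,
+ *                                        16 * 1024, &bo);
+ *    if (result == VK_SUCCESS) {
+ *       ... fill bo->map, submit ...
+ *       anv_bo_pool_free(&device->batch_bo_pool, bo);
+ *    }
+ *
+ * Freed BOs presumably land on one of the free_list buckets below rather
+ * than going back to the kernel, so a later allocation of a similar size
+ * can reuse them.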
+ */ +struct anv_bo_pool { + const char *name; + + struct anv_device *device; + + struct util_sparse_array_free_list free_list[16]; +}; + +void anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device, + const char *name); +void anv_bo_pool_finish(struct anv_bo_pool *pool); +VkResult anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size, + struct anv_bo **bo_out); +void anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo); + +struct anv_scratch_pool { + /* Indexed by Per-Thread Scratch Space number (the hardware value) and stage */ + struct anv_bo *bos[16][MESA_SHADER_STAGES]; + uint32_t surfs[16]; + struct anv_state surf_states[16]; +}; + +void anv_scratch_pool_init(struct anv_device *device, + struct anv_scratch_pool *pool); +void anv_scratch_pool_finish(struct anv_device *device, + struct anv_scratch_pool *pool); +struct anv_bo *anv_scratch_pool_alloc(struct anv_device *device, + struct anv_scratch_pool *pool, + gl_shader_stage stage, + unsigned per_thread_scratch); +uint32_t anv_scratch_pool_get_surf(struct anv_device *device, + struct anv_scratch_pool *pool, + unsigned per_thread_scratch); + +/** Implements a BO cache that ensures a 1-1 mapping of GEM BOs to anv_bos */ +struct anv_bo_cache { + struct util_sparse_array bo_map; + pthread_mutex_t mutex; +}; + +VkResult anv_bo_cache_init(struct anv_bo_cache *cache, + struct anv_device *device); +void anv_bo_cache_finish(struct anv_bo_cache *cache); + +struct anv_queue_family { + /* Standard bits passed on to the client */ + VkQueueFlags queueFlags; + uint32_t queueCount; + + /* Driver internal information */ + enum drm_i915_gem_engine_class engine_class; +}; + +#define ANV_MAX_QUEUE_FAMILIES 3 + +struct anv_memory_type { + /* Standard bits passed on to the client */ + VkMemoryPropertyFlags propertyFlags; + uint32_t heapIndex; +}; + +struct anv_memory_heap { + /* Standard bits passed on to the client */ + VkDeviceSize size; + VkMemoryHeapFlags flags; + + /** Driver-internal book-keeping. + * + * Align it to 64 bits to make atomic operations faster on 32 bit platforms. + */ + VkDeviceSize used __attribute__ ((aligned (8))); + + bool is_local_mem; +}; + +struct anv_memregion { + struct drm_i915_gem_memory_class_instance region; + uint64_t size; + uint64_t available; +}; + +struct anv_physical_device { + struct vk_physical_device vk; + + /* Link in anv_instance::physical_devices */ + struct list_head link; + + struct anv_instance * instance; + char path[20]; + struct intel_device_info info; + /** Amount of "GPU memory" we want to advertise + * + * Clearly, this value is bogus since Intel is a UMA architecture. On + * gfx7 platforms, we are limited by GTT size unless we want to implement + * fine-grained tracking and GTT splitting. On Broadwell and above we are + * practically unlimited. However, we will never report more than 3/4 of + * the total system ram to try and avoid running out of RAM. + */ + bool supports_48bit_addresses; + struct brw_compiler * compiler; + struct isl_device isl_dev; + struct intel_perf_config * perf; + /* True if hardware support is incomplete/alpha */ + bool is_alpha; + /* + * Number of commands required to implement a performance query begin + + * end. 
+ */ + uint32_t n_perf_query_commands; + int cmd_parser_version; + bool has_exec_async; + bool has_exec_capture; + int max_context_priority; + bool has_context_isolation; + bool has_mmap_offset; + bool has_userptr_probe; + uint64_t gtt_size; + + bool use_relocations; + bool use_softpin; + bool always_use_bindless; + bool use_call_secondary; + + /** True if we can access buffers using A64 messages */ + bool has_a64_buffer_access; + /** True if we can use bindless access for images */ + bool has_bindless_images; + /** True if we can use bindless access for samplers */ + bool has_bindless_samplers; + /** True if we can use timeline semaphores through execbuf */ + bool has_exec_timeline; + + /** True if we can read the GPU timestamp register + * + * When running in a virtual context, the timestamp register is unreadable + * on Gfx12+. + */ + bool has_reg_timestamp; + + /** True if this device has implicit AUX + * + * If true, CCS is handled as an implicit attachment to the BO rather than + * as an explicitly bound surface. + */ + bool has_implicit_ccs; + + bool always_flush_cache; + + struct { + uint32_t family_count; + struct anv_queue_family families[ANV_MAX_QUEUE_FAMILIES]; + } queue; + + struct { + uint32_t type_count; + struct anv_memory_type types[VK_MAX_MEMORY_TYPES]; + uint32_t heap_count; + struct anv_memory_heap heaps[VK_MAX_MEMORY_HEAPS]; + bool need_clflush; + } memory; + + /* Either we have a single vram region and it's all mappable, or we have + * both mappable & non-mappable parts. System memory is always available. + */ + struct anv_memregion vram_mappable; + struct anv_memregion vram_non_mappable; + struct anv_memregion sys; + uint8_t driver_build_sha1[20]; + uint8_t pipeline_cache_uuid[VK_UUID_SIZE]; + uint8_t driver_uuid[VK_UUID_SIZE]; + uint8_t device_uuid[VK_UUID_SIZE]; + + struct vk_sync_type sync_syncobj_type; + struct vk_sync_timeline_type sync_timeline_type; + const struct vk_sync_type * sync_types[4]; + + struct wsi_device wsi_device; + int local_fd; + bool has_local; + int64_t local_major; + int64_t local_minor; + int master_fd; + bool has_master; + int64_t master_major; + int64_t master_minor; + struct drm_i915_query_engine_info * engine_info; + + void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address, bool); + struct intel_measure_device measure_device; +}; + +static inline bool +anv_physical_device_has_vram(const struct anv_physical_device *device) +{ + return device->vram_mappable.size > 0; +} + +struct anv_instance { + struct vk_instance vk; + + struct driOptionCache dri_options; + struct driOptionCache available_dri_options; + + /** + * Workarounds for game bugs. 
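+    *
+    * These presumably come from driconf and are filled in at instance
+    * creation from the dri_options cache above, along the lines of (the
+    * option name here is illustrative):
+    *
+    *    instance->assume_full_subgroups =
+    *       driQueryOptionb(&instance->dri_options,
+    *                       "anv_assume_full_subgroups");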
+ */ + bool assume_full_subgroups; + bool limit_trig_input_range; + bool sample_mask_out_opengl_behaviour; +}; + +VkResult anv_init_wsi(struct anv_physical_device *physical_device); +void anv_finish_wsi(struct anv_physical_device *physical_device); + +struct anv_queue { + struct vk_queue vk; + + struct anv_device * device; + + const struct anv_queue_family * family; + + uint32_t index_in_family; + + uint32_t exec_flags; + + /** Synchronization object for debug purposes (DEBUG_SYNC) */ + struct vk_sync *sync; + + struct intel_ds_queue * ds; +}; + +struct nir_xfb_info; +struct anv_pipeline_bind_map; + +extern const struct vk_pipeline_cache_object_ops *const anv_cache_import_ops[2]; + +struct anv_shader_bin * +anv_device_search_for_kernel(struct anv_device *device, + struct vk_pipeline_cache *cache, + const void *key_data, uint32_t key_size, + bool *user_cache_bit); + +struct anv_shader_bin * +anv_device_upload_kernel(struct anv_device *device, + struct vk_pipeline_cache *cache, + gl_shader_stage stage, + const void *key_data, uint32_t key_size, + const void *kernel_data, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, + const struct brw_compile_stats *stats, + uint32_t num_stats, + const struct nir_xfb_info *xfb_info, + const struct anv_pipeline_bind_map *bind_map); + +struct nir_shader; +struct nir_shader_compiler_options; + +struct nir_shader * +anv_device_search_for_nir(struct anv_device *device, + struct vk_pipeline_cache *cache, + const struct nir_shader_compiler_options *nir_options, + unsigned char sha1_key[20], + void *mem_ctx); + +void +anv_device_upload_nir(struct anv_device *device, + struct vk_pipeline_cache *cache, + const struct nir_shader *nir, + unsigned char sha1_key[20]); + +struct anv_device { + struct vk_device vk; + + struct anv_physical_device * physical; + const struct intel_device_info * info; + struct isl_device isl_dev; + int context_id; + int fd; + bool can_chain_batches; + bool robust_buffer_access; + + pthread_mutex_t vma_mutex; + struct util_vma_heap vma_lo; + struct util_vma_heap vma_cva; + struct util_vma_heap vma_hi; + + /** List of all anv_device_memory objects */ + struct list_head memory_objects; + + struct anv_bo_pool batch_bo_pool; + struct anv_bo_pool utrace_bo_pool; + + struct anv_bo_cache bo_cache; + + struct anv_state_pool general_state_pool; + struct anv_state_pool dynamic_state_pool; + struct anv_state_pool instruction_state_pool; + struct anv_state_pool binding_table_pool; + struct anv_state_pool surface_state_pool; + + struct anv_state_reserved_pool custom_border_colors; + + /** BO used for various workarounds + * + * There are a number of workarounds on our hardware which require writing + * data somewhere and it doesn't really matter where. For that, we use + * this BO and just write to the first dword or so. + * + * We also need to be able to handle NULL buffers bound as pushed UBOs. + * For that, we use the high bytes (>= 1024) of the workaround BO. + */ + struct anv_bo * workaround_bo; + struct anv_address workaround_address; + + struct anv_bo * trivial_batch_bo; + struct anv_state null_surface_state; + + struct vk_pipeline_cache * default_pipeline_cache; + struct vk_pipeline_cache * internal_cache; + struct blorp_context blorp; + + struct anv_state border_colors; + + struct anv_state slice_hash; + + /** An array of CPS_STATE structures grouped by MAX_VIEWPORTS elements + * + * We need to emit CPS_STATE structures for each viewport accessible by a + * pipeline. 
So rather than write many identical CPS_STATE structures + * dynamically, we can enumerate all possible combinaisons and then just + * emit a 3DSTATE_CPS_POINTERS instruction with the right offset into this + * array. + */ + struct anv_state cps_states; + + uint32_t queue_count; + struct anv_queue * queues; + + struct anv_scratch_pool scratch_pool; + struct anv_bo *rt_scratch_bos[16]; + + /** Shadow ray query BO + * + * The ray_query_bo only holds the current ray being traced. When using + * more than 1 ray query per thread, we cannot fit all the queries in + * there, so we need a another buffer to hold query data that is not + * currently being used by the HW for tracing, similar to a scratch space. + * + * The size of the shadow buffer depends on the number of queries per + * shader. + */ + struct anv_bo *ray_query_shadow_bos[16]; + /** Ray query buffer used to communicated with HW unit. + */ + struct anv_bo *ray_query_bo; + + struct anv_shader_bin *rt_trampoline; + struct anv_shader_bin *rt_trivial_return; + + pthread_mutex_t mutex; + pthread_cond_t queue_submit; + + struct intel_batch_decode_ctx decoder_ctx; + /* + * When decoding a anv_cmd_buffer, we might need to search for BOs through + * the cmd_buffer's list. + */ + struct anv_cmd_buffer *cmd_buffer_being_decoded; + + int perf_fd; /* -1 if no opened */ + uint64_t perf_metric; /* 0 if unset */ + + struct intel_aux_map_context *aux_map_ctx; + + const struct intel_l3_config *l3_config; + + struct intel_debug_block_frame *debug_frame_desc; + + struct intel_ds_device ds; +}; + +#if defined(GFX_VERx10) && GFX_VERx10 >= 90 +#define ANV_ALWAYS_SOFTPIN true +#else +#define ANV_ALWAYS_SOFTPIN false +#endif + +static inline bool +anv_use_relocations(const struct anv_physical_device *pdevice) +{ +#if defined(GFX_VERx10) && GFX_VERx10 >= 90 + /* Sky Lake and later always uses softpin */ + assert(!pdevice->use_relocations); + return false; +#elif defined(GFX_VERx10) && GFX_VERx10 < 80 + /* Haswell and earlier never use softpin */ + assert(pdevice->use_relocations); + return true; +#else + /* If we don't have a GFX_VERx10 #define, we need to look at the physical + * device. Also, for GFX version 8, we need to look at the physical + * device because Broadwell softpins but Cherryview doesn't. + */ + return pdevice->use_relocations; +#endif +} + +static inline struct anv_state_pool * +anv_binding_table_pool(struct anv_device *device) +{ + if (anv_use_relocations(device->physical)) + return &device->surface_state_pool; + else + return &device->binding_table_pool; +} + +static inline struct anv_state +anv_binding_table_pool_alloc(struct anv_device *device) +{ + if (anv_use_relocations(device->physical)) + return anv_state_pool_alloc_back(&device->surface_state_pool); + else + return anv_state_pool_alloc(&device->binding_table_pool, + device->binding_table_pool.block_size, 0); +} + +static inline void +anv_binding_table_pool_free(struct anv_device *device, struct anv_state state) { + anv_state_pool_free(anv_binding_table_pool(device), state); +} + +static inline uint32_t +anv_mocs(const struct anv_device *device, + const struct anv_bo *bo, + isl_surf_usage_flags_t usage) +{ + return isl_mocs(&device->isl_dev, usage, bo && bo->is_external); +} + +void anv_device_init_blorp(struct anv_device *device); +void anv_device_finish_blorp(struct anv_device *device); + +enum anv_bo_alloc_flags { + /** Specifies that the BO must have a 32-bit address + * + * This is the opposite of EXEC_OBJECT_SUPPORTS_48B_ADDRESS. 
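+    *
+    * When softpin is in use, such BOs presumably have to take their virtual
+    * address from the low 4GiB heap rather than the high one, roughly
+    * (sketch only, see anv_vma_alloc() for the actual policy):
+    *
+    *    if (alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS)
+    *       addr = util_vma_heap_alloc(&device->vma_lo, size, align);
+    *    else
+    *       addr = util_vma_heap_alloc(&device->vma_hi, size, align);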
+ */ + ANV_BO_ALLOC_32BIT_ADDRESS = (1 << 0), + + /** Specifies that the BO may be shared externally */ + ANV_BO_ALLOC_EXTERNAL = (1 << 1), + + /** Specifies that the BO should be mapped */ + ANV_BO_ALLOC_MAPPED = (1 << 2), + + /** Specifies that the BO should be snooped so we get coherency */ + ANV_BO_ALLOC_SNOOPED = (1 << 3), + + /** Specifies that the BO should be captured in error states */ + ANV_BO_ALLOC_CAPTURE = (1 << 4), + + /** Specifies that the BO will have an address assigned by the caller + * + * Such BOs do not exist in any VMA heap. + */ + ANV_BO_ALLOC_FIXED_ADDRESS = (1 << 5), + + /** Enables implicit synchronization on the BO + * + * This is the opposite of EXEC_OBJECT_ASYNC. + */ + ANV_BO_ALLOC_IMPLICIT_SYNC = (1 << 6), + + /** Enables implicit synchronization on the BO + * + * This is equivalent to EXEC_OBJECT_WRITE. + */ + ANV_BO_ALLOC_IMPLICIT_WRITE = (1 << 7), + + /** Has an address which is visible to the client */ + ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS = (1 << 8), + + /** This buffer has implicit CCS data attached to it */ + ANV_BO_ALLOC_IMPLICIT_CCS = (1 << 9), + + /** This buffer is allocated from local memory and should be cpu visible */ + ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE = (1 << 10), +}; + +VkResult anv_device_alloc_bo(struct anv_device *device, + const char *name, uint64_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t explicit_address, + struct anv_bo **bo); +VkResult anv_device_map_bo(struct anv_device *device, + struct anv_bo *bo, + uint64_t offset, + size_t size, + uint32_t gem_flags, + void **map_out); +void anv_device_unmap_bo(struct anv_device *device, + struct anv_bo *bo, + void *map, size_t map_size); +VkResult anv_device_import_bo_from_host_ptr(struct anv_device *device, + void *host_ptr, uint32_t size, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address, + struct anv_bo **bo_out); +VkResult anv_device_import_bo(struct anv_device *device, int fd, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address, + struct anv_bo **bo); +VkResult anv_device_export_bo(struct anv_device *device, + struct anv_bo *bo, int *fd_out); +VkResult anv_device_get_bo_tiling(struct anv_device *device, + struct anv_bo *bo, + enum isl_tiling *tiling_out); +VkResult anv_device_set_bo_tiling(struct anv_device *device, + struct anv_bo *bo, + uint32_t row_pitch_B, + enum isl_tiling tiling); +void anv_device_release_bo(struct anv_device *device, + struct anv_bo *bo); + +static inline void anv_device_set_physical(struct anv_device *device, + struct anv_physical_device *physical_device) +{ + device->physical = physical_device; + device->info = &physical_device->info; + device->isl_dev = physical_device->isl_dev; +} + +static inline struct anv_bo * +anv_device_lookup_bo(struct anv_device *device, uint32_t gem_handle) +{ + return util_sparse_array_get(&device->bo_cache.bo_map, gem_handle); +} + +VkResult anv_device_wait(struct anv_device *device, struct anv_bo *bo, + int64_t timeout); + +VkResult anv_queue_init(struct anv_device *device, struct anv_queue *queue, + uint32_t exec_flags, + const VkDeviceQueueCreateInfo *pCreateInfo, + uint32_t index_in_family); +void anv_queue_finish(struct anv_queue *queue); + +VkResult anv_queue_submit(struct vk_queue *queue, + struct vk_queue_submit *submit); +VkResult anv_queue_submit_simple_batch(struct anv_queue *queue, + struct anv_batch *batch); + +void* anv_gem_mmap(struct anv_device *device, + uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags); +void anv_gem_munmap(struct anv_device *device, void 
*p, uint64_t size); +uint32_t anv_gem_create(struct anv_device *device, uint64_t size); +void anv_gem_close(struct anv_device *device, uint32_t gem_handle); +uint32_t anv_gem_create_regions(struct anv_device *device, uint64_t anv_bo_size, + uint32_t flags, uint32_t num_regions, + struct drm_i915_gem_memory_class_instance *regions); +uint32_t anv_gem_userptr(struct anv_device *device, void *mem, size_t size); +int anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns); +int anv_gem_execbuffer(struct anv_device *device, + struct drm_i915_gem_execbuffer2 *execbuf); +int anv_gem_set_tiling(struct anv_device *device, uint32_t gem_handle, + uint32_t stride, uint32_t tiling); +int anv_gem_create_context(struct anv_device *device); +bool anv_gem_has_context_priority(int fd, int priority); +int anv_gem_destroy_context(struct anv_device *device, int context); +int anv_gem_set_context_param(int fd, int context, uint32_t param, + uint64_t value); +int anv_gem_get_param(int fd, uint32_t param); +int anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle); +int anv_gem_context_get_reset_stats(int fd, int context, + uint32_t *active, uint32_t *pending); +int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle); +int anv_gem_reg_read(int fd, uint32_t offset, uint64_t *result); +uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd); +int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching); +int anv_i915_query(int fd, uint64_t query_id, void *buffer, + int32_t *buffer_len); +struct drm_i915_query_engine_info *anv_gem_get_engine_info(int fd); + +uint64_t anv_vma_alloc(struct anv_device *device, + uint64_t size, uint64_t align, + enum anv_bo_alloc_flags alloc_flags, + uint64_t client_address); +void anv_vma_free(struct anv_device *device, + uint64_t address, uint64_t size); + +struct anv_reloc_list { + uint32_t num_relocs; + uint32_t array_length; + struct drm_i915_gem_relocation_entry * relocs; + struct anv_bo ** reloc_bos; + uint32_t dep_words; + BITSET_WORD * deps; +}; + +VkResult anv_reloc_list_init(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc); +void anv_reloc_list_finish(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc); + +VkResult anv_reloc_list_add(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + uint32_t offset, struct anv_bo *target_bo, + uint32_t delta, uint64_t *address_u64_out); + +VkResult anv_reloc_list_add_bo(struct anv_reloc_list *list, + const VkAllocationCallbacks *alloc, + struct anv_bo *target_bo); + +struct anv_batch_bo { + /* Link in the anv_cmd_buffer.owned_batch_bos list */ + struct list_head link; + + struct anv_bo * bo; + + /* Bytes actually consumed in this batch BO */ + uint32_t length; + + /* When this batch BO is used as part of a primary batch buffer, this + * tracked whether it is chained to another primary batch buffer. + * + * If this is the case, the relocation list's last entry points the + * location of the MI_BATCH_BUFFER_START chaining to the next batch. + */ + bool chained; + + struct anv_reloc_list relocs; +}; + +struct anv_batch { + const VkAllocationCallbacks * alloc; + + struct anv_address start_addr; + + void * start; + void * end; + void * next; + + struct anv_reloc_list * relocs; + + /* This callback is called (with the associated user data) in the event + * that the batch runs out of space. 
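+    *
+    * A simplified sketch of what anv_batch_emit_dwords() presumably does
+    * with it (illustrative, not the actual implementation):
+    *
+    *    if (batch->next + 4 * num_dwords > batch->end) {
+    *       VkResult result = batch->extend_cb(batch, batch->user_data);
+    *       if (result != VK_SUCCESS) {
+    *          anv_batch_set_error(batch, result);
+    *          return NULL;
+    *       }
+    *    }
+    *    void *p = batch->next;
+    *    batch->next += 4 * num_dwords;
+    *    return p;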
+ */ + VkResult (*extend_cb)(struct anv_batch *, void *); + void * user_data; + + /** + * Current error status of the command buffer. Used to track inconsistent + * or incomplete command buffer states that are the consequence of run-time + * errors such as out of memory scenarios. We want to track this in the + * batch because the command buffer object is not visible to some parts + * of the driver. + */ + VkResult status; +}; + +void *anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords); +void anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other); +struct anv_address anv_batch_address(struct anv_batch *batch, void *batch_location); + +static inline void +anv_batch_set_storage(struct anv_batch *batch, struct anv_address addr, + void *map, size_t size) +{ + batch->start_addr = addr; + batch->next = batch->start = map; + batch->end = map + size; +} + +static inline VkResult +anv_batch_set_error(struct anv_batch *batch, VkResult error) +{ + assert(error != VK_SUCCESS); + if (batch->status == VK_SUCCESS) + batch->status = error; + return batch->status; +} + +static inline bool +anv_batch_has_error(struct anv_batch *batch) +{ + return batch->status != VK_SUCCESS; +} + +static inline uint64_t +anv_batch_emit_reloc(struct anv_batch *batch, + void *location, struct anv_bo *bo, uint32_t delta) +{ + uint64_t address_u64 = 0; + VkResult result; + + if (ANV_ALWAYS_SOFTPIN) { + address_u64 = bo->offset + delta; + result = anv_reloc_list_add_bo(batch->relocs, batch->alloc, bo); + } else { + result = anv_reloc_list_add(batch->relocs, batch->alloc, + location - batch->start, bo, delta, + &address_u64); + } + if (unlikely(result != VK_SUCCESS)) { + anv_batch_set_error(batch, result); + return 0; + } + + return address_u64; +} + +static inline void +write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush) +{ + unsigned reloc_size = 0; + if (device->info->ver >= 8) { + reloc_size = sizeof(uint64_t); + *(uint64_t *)p = intel_canonical_address(v); + } else { + reloc_size = sizeof(uint32_t); + *(uint32_t *)p = v; + } + + if (flush && device->physical->memory.need_clflush) + intel_flush_range(p, reloc_size); +} + +static inline uint64_t +_anv_combine_address(struct anv_batch *batch, void *location, + const struct anv_address address, uint32_t delta) +{ + if (address.bo == NULL) { + return address.offset + delta; + } else if (batch == NULL) { + assert(anv_bo_is_pinned(address.bo)); + return anv_address_physical(anv_address_add(address, delta)); + } else { + assert(batch->start <= location && location < batch->end); + /* i915 relocations are signed. */ + assert(INT32_MIN <= address.offset && address.offset <= INT32_MAX); + return anv_batch_emit_reloc(batch, location, address.bo, address.offset + delta); + } +} + +#define __gen_address_type struct anv_address +#define __gen_user_data struct anv_batch +#define __gen_combine_address _anv_combine_address + +/* Wrapper macros needed to work around preprocessor argument issues. In + * particular, arguments don't get pre-evaluated if they are concatenated. + * This means that, if you pass GENX(3DSTATE_PS) into the emit macro, the + * GENX macro won't get evaluated if the emit macro contains "cmd ## foo". + * We can work around this easily enough with these helpers. 
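+ *
+ * For instance, if the emit macro used "cmd ## _length" directly, then in
+ * anv_batch_emit(batch, GENX(3DSTATE_PS), ps) the ## would be applied to
+ * the unexpanded tokens "GENX(3DSTATE_PS)". Routing it through
+ * __anv_cmd_length(cmd) makes the preprocessor fully expand the argument
+ * first (to something like GFX8_3DSTATE_PS), so the paste yields
+ * GFX8_3DSTATE_PS_length as intended. A typical use of the emit macro
+ * defined below looks roughly like:
+ *
+ *    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ *       pc.CommandStreamerStallEnable = true;
+ *    }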
+ */ +#define __anv_cmd_length(cmd) cmd ## _length +#define __anv_cmd_length_bias(cmd) cmd ## _length_bias +#define __anv_cmd_header(cmd) cmd ## _header +#define __anv_cmd_pack(cmd) cmd ## _pack +#define __anv_reg_num(reg) reg ## _num + +#define anv_pack_struct(dst, struc, ...) do { \ + struct struc __template = { \ + __VA_ARGS__ \ + }; \ + __anv_cmd_pack(struc)(NULL, dst, &__template); \ + VG(VALGRIND_CHECK_MEM_IS_DEFINED(dst, __anv_cmd_length(struc) * 4)); \ + } while (0) + +#define anv_batch_emitn(batch, n, cmd, ...) ({ \ + void *__dst = anv_batch_emit_dwords(batch, n); \ + if (__dst) { \ + struct cmd __template = { \ + __anv_cmd_header(cmd), \ + .DWordLength = n - __anv_cmd_length_bias(cmd), \ + __VA_ARGS__ \ + }; \ + __anv_cmd_pack(cmd)(batch, __dst, &__template); \ + } \ + __dst; \ + }) + +#define anv_batch_emit_merge(batch, dwords0, dwords1) \ + do { \ + uint32_t *dw; \ + \ + STATIC_ASSERT(ARRAY_SIZE(dwords0) == ARRAY_SIZE(dwords1)); \ + dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0)); \ + if (!dw) \ + break; \ + for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++) \ + dw[i] = (dwords0)[i] | (dwords1)[i]; \ + VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4));\ + } while (0) + +#define anv_batch_emit(batch, cmd, name) \ + for (struct cmd name = { __anv_cmd_header(cmd) }, \ + *_dst = anv_batch_emit_dwords(batch, __anv_cmd_length(cmd)); \ + __builtin_expect(_dst != NULL, 1); \ + ({ __anv_cmd_pack(cmd)(batch, _dst, &name); \ + VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \ + _dst = NULL; \ + })) + +#define anv_batch_write_reg(batch, reg, name) \ + for (struct reg name = {}, *_cont = (struct reg *)1; _cont != NULL; \ + ({ \ + uint32_t _dw[__anv_cmd_length(reg)]; \ + __anv_cmd_pack(reg)(NULL, _dw, &name); \ + for (unsigned i = 0; i < __anv_cmd_length(reg); i++) { \ + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { \ + lri.RegisterOffset = __anv_reg_num(reg); \ + lri.DataDWord = _dw[i]; \ + } \ + } \ + _cont = NULL; \ + })) + +/* #define __gen_get_batch_dwords anv_batch_emit_dwords */ +/* #define __gen_get_batch_address anv_batch_address */ +/* #define __gen_address_value anv_address_physical */ +/* #define __gen_address_offset anv_address_add */ + +struct anv_device_memory { + struct vk_object_base base; + + struct list_head link; + + struct anv_bo * bo; + const struct anv_memory_type * type; + + void * map; + size_t map_size; + + /* The map, from the user PoV is map + map_delta */ + uint64_t map_delta; + + /* If set, we are holding reference to AHardwareBuffer + * which we must release when memory is freed. + */ + struct AHardwareBuffer * ahw; + + /* If set, this memory comes from a host pointer. */ + void * host_ptr; +}; + +/** + * Header for Vertex URB Entry (VUE) + */ +struct anv_vue_header { + uint32_t Reserved; + uint32_t RTAIndex; /* RenderTargetArrayIndex */ + uint32_t ViewportIndex; + float PointWidth; +}; + +/** Struct representing a sampled image descriptor + * + * This descriptor layout is used for sampled images, bare sampler, and + * combined image/sampler descriptors. + */ +struct anv_sampled_image_descriptor { + /** Bindless image handle + * + * This is expected to already be shifted such that the 20-bit + * SURFACE_STATE table index is in the top 20 bits. + */ + uint32_t image; + + /** Bindless sampler handle + * + * This is assumed to be a 32B-aligned SAMPLER_STATE pointer relative + * to the dynamic state base address. 
+ */ + uint32_t sampler; +}; + +struct anv_texture_swizzle_descriptor { + /** Texture swizzle + * + * See also nir_intrinsic_channel_select_intel + */ + uint8_t swizzle[4]; + + /** Unused padding to ensure the struct is a multiple of 64 bits */ + uint32_t _pad; +}; + +/** Struct representing a storage image descriptor */ +struct anv_storage_image_descriptor { + /** Bindless image handles + * + * These are expected to already be shifted such that the 20-bit + * SURFACE_STATE table index is in the top 20 bits. + */ + uint32_t vanilla; + uint32_t lowered; +}; + +/** Struct representing a address/range descriptor + * + * The fields of this struct correspond directly to the data layout of + * nir_address_format_64bit_bounded_global addresses. The last field is the + * offset in the NIR address so it must be zero so that when you load the + * descriptor you get a pointer to the start of the range. + */ +struct anv_address_range_descriptor { + uint64_t address; + uint32_t range; + uint32_t zero; +}; + +enum anv_descriptor_data { + /** The descriptor contains a BTI reference to a surface state */ + ANV_DESCRIPTOR_SURFACE_STATE = (1 << 0), + /** The descriptor contains a BTI reference to a sampler state */ + ANV_DESCRIPTOR_SAMPLER_STATE = (1 << 1), + /** The descriptor contains an actual buffer view */ + ANV_DESCRIPTOR_BUFFER_VIEW = (1 << 2), + /** The descriptor contains auxiliary image layout data */ + ANV_DESCRIPTOR_IMAGE_PARAM = (1 << 3), + /** The descriptor contains auxiliary image layout data */ + ANV_DESCRIPTOR_INLINE_UNIFORM = (1 << 4), + /** anv_address_range_descriptor with a buffer address and range */ + ANV_DESCRIPTOR_ADDRESS_RANGE = (1 << 5), + /** Bindless surface handle */ + ANV_DESCRIPTOR_SAMPLED_IMAGE = (1 << 6), + /** Storage image handles */ + ANV_DESCRIPTOR_STORAGE_IMAGE = (1 << 7), + /** Storage image handles */ + ANV_DESCRIPTOR_TEXTURE_SWIZZLE = (1 << 8), +}; + +struct anv_descriptor_set_binding_layout { + /* The type of the descriptors in this binding */ + VkDescriptorType type; + + /* Flags provided when this binding was created */ + VkDescriptorBindingFlags flags; + + /* Bitfield representing the type of data this descriptor contains */ + enum anv_descriptor_data data; + + /* Maximum number of YCbCr texture/sampler planes */ + uint8_t max_plane_count; + + /* Number of array elements in this binding (or size in bytes for inline + * uniform data) + */ + uint32_t array_size; + + /* Index into the flattened descriptor set */ + uint32_t descriptor_index; + + /* Index into the dynamic state array for a dynamic buffer */ + int16_t dynamic_offset_index; + + /* Index into the descriptor set buffer views */ + int32_t buffer_view_index; + + /* Offset into the descriptor buffer where this descriptor lives */ + uint32_t descriptor_offset; + + /* Pre computed stride */ + unsigned descriptor_stride; + + /* Immutable samplers (or NULL if no immutable samplers) */ + struct anv_sampler **immutable_samplers; +}; + +bool anv_descriptor_supports_bindless(const struct anv_physical_device *pdevice, + const struct anv_descriptor_set_binding_layout *binding, + bool sampler); + +bool anv_descriptor_requires_bindless(const struct anv_physical_device *pdevice, + const struct anv_descriptor_set_binding_layout *binding, + bool sampler); + +struct anv_descriptor_set_layout { + struct vk_object_base base; + + /* Descriptor set layouts can be destroyed at almost any time */ + uint32_t ref_cnt; + + /* Number of bindings in this descriptor set */ + uint32_t binding_count; + + /* Total number of descriptors 
*/ + uint32_t descriptor_count; + + /* Shader stages affected by this descriptor set */ + uint16_t shader_stages; + + /* Number of buffer views in this descriptor set */ + uint32_t buffer_view_count; + + /* Number of dynamic offsets used by this descriptor set */ + uint16_t dynamic_offset_count; + + /* For each dynamic buffer, which VkShaderStageFlagBits stages are using + * this buffer + */ + VkShaderStageFlags dynamic_offset_stages[MAX_DYNAMIC_BUFFERS]; + + /* Size of the descriptor buffer for this descriptor set */ + uint32_t descriptor_buffer_size; + + /* Bindings in this descriptor set */ + struct anv_descriptor_set_binding_layout binding[0]; +}; + +void anv_descriptor_set_layout_destroy(struct anv_device *device, + struct anv_descriptor_set_layout *layout); + +static inline void +anv_descriptor_set_layout_ref(struct anv_descriptor_set_layout *layout) +{ + assert(layout && layout->ref_cnt >= 1); + p_atomic_inc(&layout->ref_cnt); +} + +static inline void +anv_descriptor_set_layout_unref(struct anv_device *device, + struct anv_descriptor_set_layout *layout) +{ + assert(layout && layout->ref_cnt >= 1); + if (p_atomic_dec_zero(&layout->ref_cnt)) + anv_descriptor_set_layout_destroy(device, layout); +} + +struct anv_descriptor { + VkDescriptorType type; + + union { + struct { + VkImageLayout layout; + struct anv_image_view *image_view; + struct anv_sampler *sampler; + }; + + struct { + struct anv_buffer_view *set_buffer_view; + struct anv_buffer *buffer; + uint64_t offset; + uint64_t range; + }; + + struct anv_buffer_view *buffer_view; + + struct anv_acceleration_structure *accel_struct; + }; +}; + +struct anv_descriptor_set { + struct vk_object_base base; + + struct anv_descriptor_pool *pool; + struct anv_descriptor_set_layout *layout; + + /* Amount of space occupied in the the pool by this descriptor set. It can + * be larger than the size of the descriptor set. + */ + uint32_t size; + + /* State relative to anv_descriptor_pool::bo */ + struct anv_state desc_mem; + /* Surface state for the descriptor buffer */ + struct anv_state desc_surface_state; + + /* Descriptor set address. */ + struct anv_address desc_addr; + + uint32_t buffer_view_count; + struct anv_buffer_view *buffer_views; + + /* Link to descriptor pool's desc_sets list . */ + struct list_head pool_link; + + uint32_t descriptor_count; + struct anv_descriptor descriptors[0]; +}; + +static inline bool +anv_descriptor_set_is_push(struct anv_descriptor_set *set) +{ + return set->pool == NULL; +} + +struct anv_buffer_view { + struct vk_object_base base; + + uint64_t range; /**< VkBufferViewCreateInfo::range */ + + struct anv_address address; + + struct anv_state surface_state; + struct anv_state storage_surface_state; + struct anv_state lowered_storage_surface_state; + + struct brw_image_param lowered_storage_image_param; +}; + +struct anv_push_descriptor_set { + struct anv_descriptor_set set; + + /* Put this field right behind anv_descriptor_set so it fills up the + * descriptors[0] field. */ + struct anv_descriptor descriptors[MAX_PUSH_DESCRIPTORS]; + + /** True if the descriptor set buffer has been referenced by a draw or + * dispatch command. + */ + bool set_used_on_gpu; + + struct anv_buffer_view buffer_views[MAX_PUSH_DESCRIPTORS]; +}; + +static inline struct anv_address +anv_descriptor_set_address(struct anv_descriptor_set *set) +{ + if (anv_descriptor_set_is_push(set)) { + /* We have to flag push descriptor set as used on the GPU + * so that the next time we push descriptors, we grab a new memory. 
+ */ + struct anv_push_descriptor_set *push_set = + (struct anv_push_descriptor_set *)set; + push_set->set_used_on_gpu = true; + } + + return set->desc_addr; +} + +struct anv_descriptor_pool { + struct vk_object_base base; + + uint32_t size; + uint32_t next; + uint32_t free_list; + + struct anv_bo *bo; + struct util_vma_heap bo_heap; + + struct anv_state_stream surface_state_stream; + void *surface_state_free_list; + + struct list_head desc_sets; + + bool host_only; + + char data[0]; +}; + +struct anv_descriptor_template_entry { + /* The type of descriptor in this entry */ + VkDescriptorType type; + + /* Binding in the descriptor set */ + uint32_t binding; + + /* Offset at which to write into the descriptor set binding */ + uint32_t array_element; + + /* Number of elements to write into the descriptor set binding */ + uint32_t array_count; + + /* Offset into the user provided data */ + size_t offset; + + /* Stride between elements into the user provided data */ + size_t stride; +}; + +struct anv_descriptor_update_template { + struct vk_object_base base; + + VkPipelineBindPoint bind_point; + + /* The descriptor set this template corresponds to. This value is only + * valid if the template was created with the templateType + * VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET. + */ + uint8_t set; + + /* Number of entries in this template */ + uint32_t entry_count; + + /* Entries of the template */ + struct anv_descriptor_template_entry entries[0]; +}; + +size_t +anv_descriptor_set_layout_size(const struct anv_descriptor_set_layout *layout, + uint32_t var_desc_count); + +uint32_t +anv_descriptor_set_layout_descriptor_buffer_size(const struct anv_descriptor_set_layout *set_layout, + uint32_t var_desc_count); + +void +anv_descriptor_set_write_image_view(struct anv_device *device, + struct anv_descriptor_set *set, + const VkDescriptorImageInfo * const info, + VkDescriptorType type, + uint32_t binding, + uint32_t element); + +void +anv_descriptor_set_write_buffer_view(struct anv_device *device, + struct anv_descriptor_set *set, + VkDescriptorType type, + struct anv_buffer_view *buffer_view, + uint32_t binding, + uint32_t element); + +void +anv_descriptor_set_write_buffer(struct anv_device *device, + struct anv_descriptor_set *set, + struct anv_state_stream *alloc_stream, + VkDescriptorType type, + struct anv_buffer *buffer, + uint32_t binding, + uint32_t element, + VkDeviceSize offset, + VkDeviceSize range); + +void +anv_descriptor_set_write_acceleration_structure(struct anv_device *device, + struct anv_descriptor_set *set, + struct anv_acceleration_structure *accel, + uint32_t binding, + uint32_t element); + +void +anv_descriptor_set_write_inline_uniform_data(struct anv_device *device, + struct anv_descriptor_set *set, + uint32_t binding, + const void *data, + size_t offset, + size_t size); + +void +anv_descriptor_set_write_template(struct anv_device *device, + struct anv_descriptor_set *set, + struct anv_state_stream *alloc_stream, + const struct anv_descriptor_update_template *template, + const void *data); + +#define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 5) +#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 4) +#define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 3) +#define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS (UINT8_MAX - 2) +#define ANV_DESCRIPTOR_SET_SHADER_CONSTANTS (UINT8_MAX - 1) +#define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX + +struct anv_pipeline_binding { + /** Index in the descriptor set + * + * This is a flattened index; the descriptor set layout is already taken + * 
into account. + */ + uint32_t index; + + /** The descriptor set this surface corresponds to. + * + * The special ANV_DESCRIPTOR_SET_* values above indicates that this + * binding is not a normal descriptor set but something else. + */ + uint8_t set; + + union { + /** Plane in the binding index for images */ + uint8_t plane; + + /** Dynamic offset index (for dynamic UBOs and SSBOs) */ + uint8_t dynamic_offset_index; + }; + + /** For a storage image, whether it requires a lowered surface */ + uint8_t lowered_storage_surface; + + /** Pad to 64 bits so that there are no holes and we can safely memcmp + * assuming POD zero-initialization. + */ + uint8_t pad; +}; + +struct anv_push_range { + /** Index in the descriptor set */ + uint32_t index; + + /** Descriptor set index */ + uint8_t set; + + /** Dynamic offset index (for dynamic UBOs) */ + uint8_t dynamic_offset_index; + + /** Start offset in units of 32B */ + uint8_t start; + + /** Range in units of 32B */ + uint8_t length; +}; + +struct anv_pipeline_layout { + struct vk_object_base base; + + struct { + struct anv_descriptor_set_layout *layout; + uint32_t dynamic_offset_start; + } set[MAX_SETS]; + + uint32_t num_sets; + + unsigned char sha1[20]; +}; + +struct anv_buffer { + struct vk_buffer vk; + + /* Set when bound */ + struct anv_address address; +}; + +enum anv_cmd_dirty_bits { + ANV_CMD_DIRTY_PIPELINE = 1 << 0, + ANV_CMD_DIRTY_INDEX_BUFFER = 1 << 1, + ANV_CMD_DIRTY_RENDER_TARGETS = 1 << 2, + ANV_CMD_DIRTY_XFB_ENABLE = 1 << 3, +}; +typedef enum anv_cmd_dirty_bits anv_cmd_dirty_mask_t; + +enum anv_pipe_bits { + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT = (1 << 0), + ANV_PIPE_STALL_AT_SCOREBOARD_BIT = (1 << 1), + ANV_PIPE_STATE_CACHE_INVALIDATE_BIT = (1 << 2), + ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT = (1 << 3), + ANV_PIPE_VF_CACHE_INVALIDATE_BIT = (1 << 4), + ANV_PIPE_DATA_CACHE_FLUSH_BIT = (1 << 5), + ANV_PIPE_TILE_CACHE_FLUSH_BIT = (1 << 6), + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT = (1 << 10), + ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT = (1 << 11), + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT = (1 << 12), + ANV_PIPE_DEPTH_STALL_BIT = (1 << 13), + + /* ANV_PIPE_HDC_PIPELINE_FLUSH_BIT is a precise way to ensure prior data + * cache work has completed. Available on Gfx12+. For earlier Gfx we + * must reinterpret this flush as ANV_PIPE_DATA_CACHE_FLUSH_BIT. + */ + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT = (1 << 14), + ANV_PIPE_PSS_STALL_SYNC_BIT = (1 << 15), + + /* + * This bit flush data-port's Untyped L1 data cache (LSC L1). + */ + ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT = (1 << 16), + + ANV_PIPE_CS_STALL_BIT = (1 << 20), + ANV_PIPE_END_OF_PIPE_SYNC_BIT = (1 << 21), + + /* This bit does not exist directly in PIPE_CONTROL. Instead it means that + * a flush has happened but not a CS stall. The next time we do any sort + * of invalidation we need to insert a CS stall at that time. Otherwise, + * we would have to CS stall on every flush which could be bad. + */ + ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT = (1 << 22), + + /* This bit does not exist directly in PIPE_CONTROL. It means that render + * target operations related to transfer commands with VkBuffer as + * destination are ongoing. Some operations like copies on the command + * streamer might need to be aware of this to trigger the appropriate stall + * before they can proceed with the copy. + */ + ANV_PIPE_RENDER_TARGET_BUFFER_WRITES = (1 << 23), + + /* This bit does not exist directly in PIPE_CONTROL. It means that Gfx12 + * AUX-TT data has changed and we need to invalidate AUX-TT data. 
This is + * done by writing the AUX-TT register. + */ + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 24), + + /* This bit does not exist directly in PIPE_CONTROL. It means that a + * PIPE_CONTROL with a post-sync operation will follow. This is used to + * implement a workaround for Gfx9. + */ + ANV_PIPE_POST_SYNC_BIT = (1 << 25), +}; + +#define ANV_PIPE_FLUSH_BITS ( \ + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \ + ANV_PIPE_DATA_CACHE_FLUSH_BIT | \ + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | \ + ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT | \ + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | \ + ANV_PIPE_TILE_CACHE_FLUSH_BIT) + +#define ANV_PIPE_STALL_BITS ( \ + ANV_PIPE_STALL_AT_SCOREBOARD_BIT | \ + ANV_PIPE_DEPTH_STALL_BIT | \ + ANV_PIPE_CS_STALL_BIT) + +#define ANV_PIPE_INVALIDATE_BITS ( \ + ANV_PIPE_STATE_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_VF_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | \ + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT | \ + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) + +enum intel_ds_stall_flag +anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits); + +static inline enum anv_pipe_bits +anv_pipe_flush_bits_for_access_flags(struct anv_device *device, + VkAccessFlags2 flags) +{ + enum anv_pipe_bits pipe_bits = 0; + + u_foreach_bit64(b, flags) { + switch ((VkAccessFlags2)BITFIELD64_BIT(b)) { + case VK_ACCESS_2_SHADER_WRITE_BIT: + case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT: + /* We're transitioning a buffer that was previously used as write + * destination through the data port. To make its content available + * to future operations, flush the hdc pipeline. + */ + pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT; + pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT; + break; + case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT: + /* We're transitioning a buffer that was previously used as render + * target. To make its content available to future operations, flush + * the render target cache. + */ + pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + break; + case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT: + /* We're transitioning a buffer that was previously used as depth + * buffer. To make its content available to future operations, flush + * the depth cache. + */ + pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; + break; + case VK_ACCESS_2_TRANSFER_WRITE_BIT: + /* We're transitioning a buffer that was previously used as a + * transfer write destination. Generic write operations include color + * & depth operations as well as buffer operations like : + * - vkCmdClearColorImage() + * - vkCmdClearDepthStencilImage() + * - vkCmdBlitImage() + * - vkCmdCopy*(), vkCmdUpdate*(), vkCmdFill*() + * + * Most of these operations are implemented using Blorp which writes + * through the render target, so flush that cache to make it visible + * to future operations. And for depth related operations we also + * need to flush the depth cache. + */ + pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; + break; + case VK_ACCESS_2_MEMORY_WRITE_BIT: + /* We're transitioning a buffer for generic write operations. Flush + * all the caches. + */ + pipe_bits |= ANV_PIPE_FLUSH_BITS; + break; + case VK_ACCESS_2_HOST_WRITE_BIT: + /* We're transitioning a buffer for access by CPU. Invalidate + * all the caches. Since data and tile caches don't have invalidate, + * we are forced to flush those as well. 
+ */ + pipe_bits |= ANV_PIPE_FLUSH_BITS; + pipe_bits |= ANV_PIPE_INVALIDATE_BITS; + break; + case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT: + case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT: + /* We're transitioning a buffer written either from VS stage or from + * the command streamer (see CmdEndTransformFeedbackEXT), we just + * need to stall the CS. + */ + pipe_bits |= ANV_PIPE_CS_STALL_BIT; + break; + default: + break; /* Nothing to do */ + } + } + + return pipe_bits; +} + +static inline enum anv_pipe_bits +anv_pipe_invalidate_bits_for_access_flags(struct anv_device *device, + VkAccessFlags2 flags) +{ + enum anv_pipe_bits pipe_bits = 0; + + u_foreach_bit64(b, flags) { + switch ((VkAccessFlags2)BITFIELD64_BIT(b)) { + case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT: + /* Indirect draw commands take a buffer as input that we're going to + * read from the command streamer to load some of the HW registers + * (see genX_cmd_buffer.c:load_indirect_parameters). This requires a + * command streamer stall so that all the cache flushes have + * completed before the command streamer loads from memory. + */ + pipe_bits |= ANV_PIPE_CS_STALL_BIT; + /* Indirect draw commands also set gl_BaseVertex & gl_BaseIndex + * through a vertex buffer, so invalidate that cache. + */ + pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + /* For CmdDipatchIndirect, we also load gl_NumWorkGroups through a + * UBO from the buffer, so we need to invalidate constant cache. + */ + pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT; + pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT; + /* Tile cache flush needed For CmdDipatchIndirect since command + * streamer and vertex fetch aren't L3 coherent. + */ + pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT; + break; + case VK_ACCESS_2_INDEX_READ_BIT: + case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT: + /* We transitioning a buffer to be used for as input for vkCmdDraw* + * commands, so we invalidate the VF cache to make sure there is no + * stale data when we start rendering. + */ + pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + break; + case VK_ACCESS_2_UNIFORM_READ_BIT: + /* We transitioning a buffer to be used as uniform data. Because + * uniform is accessed through the data port & sampler, we need to + * invalidate the texture cache (sampler) & constant cache (data + * port) to avoid stale data. + */ + pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT; + if (device->physical->compiler->indirect_ubos_use_sampler) { + pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; + } else { + pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT; + pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT; + } + break; + case VK_ACCESS_2_SHADER_READ_BIT: + case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT: + case VK_ACCESS_2_TRANSFER_READ_BIT: + /* Transitioning a buffer to be read through the sampler, so + * invalidate the texture cache, we don't want any stale data. + */ + pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; + break; + case VK_ACCESS_2_MEMORY_READ_BIT: + /* Transitioning a buffer for generic read, invalidate all the + * caches. + */ + pipe_bits |= ANV_PIPE_INVALIDATE_BITS; + break; + case VK_ACCESS_2_MEMORY_WRITE_BIT: + /* Generic write, make sure all previously written things land in + * memory. + */ + pipe_bits |= ANV_PIPE_FLUSH_BITS; + break; + case VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT: + case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT: + /* Transitioning a buffer for conditional rendering or transform + * feedback. 
We'll load the content of this buffer into HW registers
+ * using the command streamer, so we need to stall the command
+ * streamer to make sure any in-flight flush operations have
+ * completed.
+ */
+ pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+ pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_HOST_READ_BIT:
+ /* We're transitioning a buffer that was written by CPU. Flush
+ * all the caches.
+ */
+ pipe_bits |= ANV_PIPE_FLUSH_BITS;
+ break;
+ case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
+ /* We're transitioning a buffer to be written by the streamout fixed
+ * function. This one is apparently not L3 coherent, so we need a
+ * tile cache flush to make sure any previous write is not going to
+ * create WaW hazards.
+ */
+ pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+ break;
+ default:
+ break; /* Nothing to do */
+ }
+ }
+
+ return pipe_bits;
+}
+
+#define VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV ( \
+ VK_IMAGE_ASPECT_COLOR_BIT | \
+ VK_IMAGE_ASPECT_PLANE_0_BIT | \
+ VK_IMAGE_ASPECT_PLANE_1_BIT | \
+ VK_IMAGE_ASPECT_PLANE_2_BIT)
+#define VK_IMAGE_ASPECT_PLANES_BITS_ANV ( \
+ VK_IMAGE_ASPECT_PLANE_0_BIT | \
+ VK_IMAGE_ASPECT_PLANE_1_BIT | \
+ VK_IMAGE_ASPECT_PLANE_2_BIT)
+
+struct anv_vertex_binding {
+ struct anv_buffer * buffer;
+ VkDeviceSize offset;
+ VkDeviceSize size;
+};
+
+struct anv_xfb_binding {
+ struct anv_buffer * buffer;
+ VkDeviceSize offset;
+ VkDeviceSize size;
+};
+
+struct anv_push_constants {
+ /** Push constant data provided by the client through vkPushConstants */
+ uint8_t client_data[MAX_PUSH_CONSTANTS_SIZE];
+
+ /** Dynamic offsets for dynamic UBOs and SSBOs */
+ uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS];
+
+ /* Robust access pushed registers. */
+ uint64_t push_reg_mask[MESA_SHADER_STAGES];
+
+ /** Ray query globals (RT_DISPATCH_GLOBALS) */
+ uint64_t ray_query_globals;
+
+ /* Base addresses for descriptor sets */
+ uint64_t desc_sets[MAX_SETS];
+
+ struct {
+ /** Base workgroup ID
+ *
+ * Used for vkCmdDispatchBase.
+ */
+ uint32_t base_work_group_id[3];
+
+ /** Subgroup ID
+ *
+ * This is never set by software but is implicitly filled out when
+ * uploading the push constants for compute shaders.
+ */
+ uint32_t subgroup_id;
+ } cs;
+};
+
+struct anv_surface_state {
+ struct anv_state state;
+ /** Address of the surface referred to by this state
+ *
+ * This address is relative to the start of the BO.
+ */
+ struct anv_address address;
+ /* Address of the aux surface, if any
+ *
+ * This field is ANV_NULL_ADDRESS if and only if no aux surface exists.
+ *
+ * With the exception of gfx8, the bottom 12 bits of this address' offset
+ * include extra aux information.
+ */
+ struct anv_address aux_address;
+ /* Address of the clear color, if any
+ *
+ * This address is relative to the start of the BO.
+ */
+ struct anv_address clear_address;
+};
+
+struct anv_attachment {
+ VkFormat vk_format;
+ const struct anv_image_view *iview;
+ VkImageLayout layout;
+ enum isl_aux_usage aux_usage;
+ struct anv_surface_state surface_state;
+
+ VkResolveModeFlagBits resolve_mode;
+ const struct anv_image_view *resolve_iview;
+ VkImageLayout resolve_layout;
+};
+
+/** State tracking for vertex buffer flushes
+ *
+ * On Gfx8-9, the VF cache only considers the bottom 32 bits of memory
+ * addresses. If you happen to have two vertex buffers which get placed
+ * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
+ * collisions.
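As a quick illustration (a minimal standalone sketch with made-up addresses, not taken from the driver), two base addresses placed exactly 4 GiB apart truncate to the same low 32 bits:

#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* Hypothetical vertex buffer base addresses, exactly 4 GiB apart. */
   uint64_t vb0 = 0x0000000100001000ull;
   uint64_t vb1 = vb0 + (1ull << 32);

   /* A Gfx8-9 style VF cache tag keeps only the low 32 address bits, so
    * both buffers map to the same tag and their entries can collide. */
   assert((uint32_t)vb0 == (uint32_t)vb1);
   return 0;
}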
In order to solve this problem, we track vertex address ranges + * which are live in the cache and invalidate the cache if one ever exceeds 32 + * bits. + */ +struct anv_vb_cache_range { + /* Virtual address at which the live vertex buffer cache range starts for + * this vertex buffer index. + */ + uint64_t start; + + /* Virtual address of the byte after where vertex buffer cache range ends. + * This is exclusive such that end - start is the size of the range. + */ + uint64_t end; +}; + +/* Check whether we need to apply the Gfx8-9 vertex buffer workaround*/ +static inline bool +anv_gfx8_9_vb_cache_range_needs_workaround(struct anv_vb_cache_range *bound, + struct anv_vb_cache_range *dirty, + struct anv_address vb_address, + uint32_t vb_size) +{ + if (vb_size == 0) { + bound->start = 0; + bound->end = 0; + return false; + } + + assert(vb_address.bo && anv_bo_is_pinned(vb_address.bo)); + bound->start = intel_48b_address(anv_address_physical(vb_address)); + bound->end = bound->start + vb_size; + assert(bound->end > bound->start); /* No overflow */ + + /* Align everything to a cache line */ + bound->start &= ~(64ull - 1ull); + bound->end = align_u64(bound->end, 64); + + /* Compute the dirty range */ + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + + /* If our range is larger than 32 bits, we have to flush */ + assert(bound->end - bound->start <= (1ull << 32)); + return (dirty->end - dirty->start) > (1ull << 32); +} + +/** State tracking for particular pipeline bind point + * + * This struct is the base struct for anv_cmd_graphics_state and + * anv_cmd_compute_state. These are used to track state which is bound to a + * particular type of pipeline. Generic state that applies per-stage such as + * binding table offsets and push constants is tracked generically with a + * per-stage array in anv_cmd_state. + */ +struct anv_cmd_pipeline_state { + struct anv_descriptor_set *descriptors[MAX_SETS]; + struct anv_push_descriptor_set *push_descriptors[MAX_SETS]; + + struct anv_push_constants push_constants; + + /* Push constant state allocated when flushing push constants. */ + struct anv_state push_constants_state; +}; + +/** State tracking for graphics pipeline + * + * This has anv_cmd_pipeline_state as a base struct to track things which get + * bound to a graphics pipeline. Along with general pipeline bind point state + * which is in the anv_cmd_pipeline_state base struct, it also contains other + * state which is graphics-specific. 
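Returning to the access-flag helpers above, here is a minimal sketch (the wrapper name is hypothetical; anv_add_pending_pipe_bits() is declared further down in this header) of how anv_pipe_flush_bits_for_access_flags() and anv_pipe_invalidate_bits_for_access_flags() are typically combined when handling a memory barrier:

/* Hypothetical illustration: fold a barrier's source/destination access
 * masks into the command buffer's pending PIPE_CONTROL bits. */
static void
example_accumulate_barrier_bits(struct anv_cmd_buffer *cmd_buffer,
                                VkAccessFlags2 src_access,
                                VkAccessFlags2 dst_access)
{
   enum anv_pipe_bits bits =
      anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_access) |
      anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_access);

   anv_add_pending_pipe_bits(cmd_buffer, bits, "example barrier");
}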
+ */ +struct anv_cmd_graphics_state { + struct anv_cmd_pipeline_state base; + + struct anv_graphics_pipeline *pipeline; + + VkRenderingFlags rendering_flags; + VkRect2D render_area; + uint32_t layer_count; + uint32_t samples; + uint32_t view_mask; + uint32_t color_att_count; + struct anv_state att_states; + struct anv_attachment color_att[MAX_RTS]; + struct anv_attachment depth_att; + struct anv_attachment stencil_att; + struct anv_state null_surface_state; + + anv_cmd_dirty_mask_t dirty; + uint32_t vb_dirty; + + struct anv_vb_cache_range ib_bound_range; + struct anv_vb_cache_range ib_dirty_range; + struct anv_vb_cache_range vb_bound_ranges[33]; + struct anv_vb_cache_range vb_dirty_ranges[33]; + + uint32_t restart_index; + + VkShaderStageFlags push_constant_stages; + + uint32_t primitive_topology; + + struct anv_buffer *index_buffer; + uint32_t index_type; /**< 3DSTATE_INDEX_BUFFER.IndexFormat */ + uint32_t index_offset; + + struct vk_sample_locations_state sample_locations; +}; + +enum anv_depth_reg_mode { + ANV_DEPTH_REG_MODE_UNKNOWN = 0, + ANV_DEPTH_REG_MODE_HW_DEFAULT, + ANV_DEPTH_REG_MODE_D16_1X_MSAA, +}; + +/** State tracking for compute pipeline + * + * This has anv_cmd_pipeline_state as a base struct to track things which get + * bound to a compute pipeline. Along with general pipeline bind point state + * which is in the anv_cmd_pipeline_state base struct, it also contains other + * state which is compute-specific. + */ +struct anv_cmd_compute_state { + struct anv_cmd_pipeline_state base; + + struct anv_compute_pipeline *pipeline; + + bool pipeline_dirty; + + struct anv_state push_data; + + struct anv_address num_workgroups; +}; + +struct anv_cmd_ray_tracing_state { + struct anv_cmd_pipeline_state base; + + struct anv_ray_tracing_pipeline *pipeline; + + bool pipeline_dirty; + + struct { + struct anv_bo *bo; + struct brw_rt_scratch_layout layout; + } scratch; +}; + +/** State required while building cmd buffer */ +struct anv_cmd_state { + /* PIPELINE_SELECT.PipelineSelection */ + uint32_t current_pipeline; + const struct intel_l3_config * current_l3_config; + uint32_t last_aux_map_state; + + struct anv_cmd_graphics_state gfx; + struct anv_cmd_compute_state compute; + struct anv_cmd_ray_tracing_state rt; + + enum anv_pipe_bits pending_pipe_bits; + VkShaderStageFlags descriptors_dirty; + VkShaderStageFlags push_constants_dirty; + + struct anv_vertex_binding vertex_bindings[MAX_VBS]; + bool xfb_enabled; + struct anv_xfb_binding xfb_bindings[MAX_XFB_BUFFERS]; + struct anv_state binding_tables[MESA_VULKAN_SHADER_STAGES]; + struct anv_state samplers[MESA_VULKAN_SHADER_STAGES]; + + unsigned char sampler_sha1s[MESA_VULKAN_SHADER_STAGES][20]; + unsigned char surface_sha1s[MESA_VULKAN_SHADER_STAGES][20]; + unsigned char push_sha1s[MESA_VULKAN_SHADER_STAGES][20]; + + /** + * Whether or not the gfx8 PMA fix is enabled. We ensure that, at the top + * of any command buffer it is disabled by disabling it in EndCommandBuffer + * and before invoking the secondary in ExecuteCommands. + */ + bool pma_fix_enabled; + + /** + * Whether or not we know for certain that HiZ is enabled for the current + * subpass. If, for whatever reason, we are unsure as to whether HiZ is + * enabled or not, this will be false. + */ + bool hiz_enabled; + + /* We ensure the registers for the gfx12 D16 fix are initialized at the + * first non-NULL depth stencil packet emission of every command buffer. + * For secondary command buffer execution, we transfer the state from the + * last command buffer to the primary (if known). 
+ */ + enum anv_depth_reg_mode depth_reg_mode; + + bool conditional_render_enabled; + + /** + * Last rendering scale argument provided to + * genX(cmd_buffer_emit_hashing_mode)(). + */ + unsigned current_hash_scale; + + /** + * A buffer used for spill/fill of ray queries. + */ + struct anv_bo * ray_query_shadow_bo; +}; + +#define ANV_MIN_CMD_BUFFER_BATCH_SIZE 8192 +#define ANV_MAX_CMD_BUFFER_BATCH_SIZE (16 * 1024 * 1024) + +enum anv_cmd_buffer_exec_mode { + ANV_CMD_BUFFER_EXEC_MODE_PRIMARY, + ANV_CMD_BUFFER_EXEC_MODE_EMIT, + ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT, + ANV_CMD_BUFFER_EXEC_MODE_CHAIN, + ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN, + ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN, +}; + +struct anv_measure_batch; + +struct anv_cmd_buffer { + struct vk_command_buffer vk; + + struct anv_device * device; + struct anv_queue_family * queue_family; + + struct anv_batch batch; + + /* Pointer to the location in the batch where MI_BATCH_BUFFER_END was + * recorded upon calling vkEndCommandBuffer(). This is useful if we need to + * rewrite the end to chain multiple batch together at vkQueueSubmit(). + */ + void * batch_end; + + /* Fields required for the actual chain of anv_batch_bo's. + * + * These fields are initialized by anv_cmd_buffer_init_batch_bo_chain(). + */ + struct list_head batch_bos; + enum anv_cmd_buffer_exec_mode exec_mode; + + /* A vector of anv_batch_bo pointers for every batch or surface buffer + * referenced by this command buffer + * + * initialized by anv_cmd_buffer_init_batch_bo_chain() + */ + struct u_vector seen_bbos; + + /* A vector of int32_t's for every block of binding tables. + * + * initialized by anv_cmd_buffer_init_batch_bo_chain() + */ + struct u_vector bt_block_states; + struct anv_state bt_next; + + struct anv_reloc_list surface_relocs; + /** Last seen surface state block pool center bo offset */ + uint32_t last_ss_pool_center; + + /* Serial for tracking buffer completion */ + uint32_t serial; + + /* Stream objects for storing temporary data */ + struct anv_state_stream surface_state_stream; + struct anv_state_stream dynamic_state_stream; + struct anv_state_stream general_state_stream; + + VkCommandBufferUsageFlags usage_flags; + + struct anv_query_pool *perf_query_pool; + + struct anv_cmd_state state; + + struct anv_address return_addr; + + /* Set by SetPerformanceMarkerINTEL, written into queries by CmdBeginQuery */ + uint64_t intel_perf_marker; + + struct anv_measure_batch *measure; + + /** + * KHR_performance_query requires self modifying command buffers and this + * array has the location of modifying commands to the query begin and end + * instructions storing performance counters. The array length is + * anv_physical_device::n_perf_query_commands. + */ + struct mi_address_token *self_mod_locations; + + /** + * Index tracking which of the self_mod_locations items have already been + * used. + */ + uint32_t perf_reloc_idx; + + /** + * Sum of all the anv_batch_bo sizes allocated for this command buffer. + * Used to increase allocation size for long command buffers. + */ + uint32_t total_batch_size; + + /** + * + */ + struct u_trace trace; +}; + +/* Determine whether we can chain a given cmd_buffer to another one. We need + * softpin and we also need to make sure that we can edit the end of the batch + * to point to next one, which requires the command buffer to not be used + * simultaneously. 
+ */ +static inline bool +anv_cmd_buffer_is_chainable(struct anv_cmd_buffer *cmd_buffer) +{ + return !anv_use_relocations(cmd_buffer->device->physical) && + !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT); +} + +VkResult anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer); +void anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer); +void anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer); +void anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer); +void anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, + struct anv_cmd_buffer *secondary); +void anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer); +VkResult anv_cmd_buffer_execbuf(struct anv_queue *queue, + struct anv_cmd_buffer *cmd_buffer, + const VkSemaphore *in_semaphores, + const uint64_t *in_wait_values, + uint32_t num_in_semaphores, + const VkSemaphore *out_semaphores, + const uint64_t *out_signal_values, + uint32_t num_out_semaphores, + VkFence fence, + int perf_query_pass); + +VkResult anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer); + +struct anv_state anv_cmd_buffer_emit_dynamic(struct anv_cmd_buffer *cmd_buffer, + const void *data, uint32_t size, uint32_t alignment); +struct anv_state anv_cmd_buffer_merge_dynamic(struct anv_cmd_buffer *cmd_buffer, + uint32_t *a, uint32_t *b, + uint32_t dwords, uint32_t alignment); + +struct anv_address +anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer); +struct anv_state +anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer, + uint32_t entries, uint32_t *state_offset); +struct anv_state +anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer); +struct anv_state +anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer, + uint32_t size, uint32_t alignment); + +VkResult +anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer); + +void anv_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer); + +struct anv_state +anv_cmd_buffer_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer); +struct anv_state +anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer); + +VkResult +anv_cmd_buffer_alloc_blorp_binding_table(struct anv_cmd_buffer *cmd_buffer, + uint32_t num_entries, + uint32_t *state_offset, + struct anv_state *bt_state); + +void anv_cmd_buffer_dump(struct anv_cmd_buffer *cmd_buffer); + +void anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer); + +enum anv_bo_sync_state { + /** Indicates that this is a new (or newly reset fence) */ + ANV_BO_SYNC_STATE_RESET, + + /** Indicates that this fence has been submitted to the GPU but is still + * (as far as we know) in use by the GPU. 
+ */ + ANV_BO_SYNC_STATE_SUBMITTED, + + ANV_BO_SYNC_STATE_SIGNALED, +}; + +struct anv_bo_sync { + struct vk_sync sync; + + enum anv_bo_sync_state state; + struct anv_bo *bo; +}; + +extern const struct vk_sync_type anv_bo_sync_type; + +static inline bool +vk_sync_is_anv_bo_sync(const struct vk_sync *sync) +{ + return sync->type == &anv_bo_sync_type; +} + +VkResult anv_create_sync_for_memory(struct vk_device *device, + VkDeviceMemory memory, + bool signal_memory, + struct vk_sync **sync_out); + +struct anv_event { + struct vk_object_base base; + uint64_t semaphore; + struct anv_state state; +}; + +#define ANV_STAGE_MASK ((1 << MESA_VULKAN_SHADER_STAGES) - 1) + +#define anv_foreach_stage(stage, stage_bits) \ + for (gl_shader_stage stage, \ + __tmp = (gl_shader_stage)((stage_bits) & ANV_STAGE_MASK); \ + stage = __builtin_ffs(__tmp) - 1, __tmp; \ + __tmp &= ~(1 << (stage))) + +struct anv_pipeline_bind_map { + unsigned char surface_sha1[20]; + unsigned char sampler_sha1[20]; + unsigned char push_sha1[20]; + + uint32_t surface_count; + uint32_t sampler_count; + + struct anv_pipeline_binding * surface_to_descriptor; + struct anv_pipeline_binding * sampler_to_descriptor; + + struct anv_push_range push_ranges[4]; +}; + +struct anv_shader_bin { + struct vk_pipeline_cache_object base; + + gl_shader_stage stage; + + struct anv_state kernel; + uint32_t kernel_size; + + const struct brw_stage_prog_data *prog_data; + uint32_t prog_data_size; + + struct brw_compile_stats stats[3]; + uint32_t num_stats; + + struct nir_xfb_info *xfb_info; + + struct anv_pipeline_bind_map bind_map; +}; + +struct anv_shader_bin * +anv_shader_bin_create(struct anv_device *device, + gl_shader_stage stage, + const void *key, uint32_t key_size, + const void *kernel, uint32_t kernel_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, + const struct brw_compile_stats *stats, uint32_t num_stats, + const struct nir_xfb_info *xfb_info, + const struct anv_pipeline_bind_map *bind_map); + +static inline void +anv_shader_bin_ref(struct anv_shader_bin *shader) +{ + vk_pipeline_cache_object_ref(&shader->base); +} + +static inline void +anv_shader_bin_unref(struct anv_device *device, struct anv_shader_bin *shader) +{ + vk_pipeline_cache_object_unref(&shader->base); +} + +#define anv_shader_bin_get_bsr(bin, local_arg_offset) ({ \ + assert((local_arg_offset) % 8 == 0); \ + const struct brw_bs_prog_data *prog_data = \ + brw_bs_prog_data_const(bin->prog_data); \ + assert(prog_data->simd_size == 8 || prog_data->simd_size == 16); \ + \ + (struct GFX_BINDLESS_SHADER_RECORD) { \ + .OffsetToLocalArguments = (local_arg_offset) / 8, \ + .BindlessShaderDispatchMode = \ + prog_data->simd_size == 16 ? 
RT_SIMD16 : RT_SIMD8, \ + .KernelStartPointer = bin->kernel.offset, \ + }; \ +}) + +struct anv_pipeline_executable { + gl_shader_stage stage; + + struct brw_compile_stats stats; + + char *nir; + char *disasm; +}; + +enum anv_pipeline_type { + ANV_PIPELINE_GRAPHICS, + ANV_PIPELINE_COMPUTE, + ANV_PIPELINE_RAY_TRACING, +}; + +struct anv_pipeline { + struct vk_object_base base; + + struct anv_device * device; + + struct anv_batch batch; + struct anv_reloc_list batch_relocs; + + void * mem_ctx; + + enum anv_pipeline_type type; + VkPipelineCreateFlags flags; + + uint32_t ray_queries; + + struct util_dynarray executables; + + const struct intel_l3_config * l3_config; +}; + +struct anv_graphics_pipeline { + struct anv_pipeline base; + + /* Shaders */ + struct anv_shader_bin * shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT]; + + VkShaderStageFlags active_stages; + + struct vk_sample_locations_state sample_locations; + struct vk_dynamic_graphics_state dynamic_state; + + /* These fields are required with dynamic primitive topology, + * rasterization_samples used only with gen < 8. + */ + VkLineRasterizationModeEXT line_mode; + VkPolygonMode polygon_mode; + uint32_t patch_control_points; + uint32_t rasterization_samples; + + VkColorComponentFlags color_comp_writes[MAX_RTS]; + + uint32_t view_mask; + uint32_t instance_multiplier; + + bool depth_clamp_enable; + bool depth_clip_enable; + bool kill_pixel; + bool force_fragment_thread_dispatch; + bool negative_one_to_one; + + uint32_t vb_used; + struct anv_pipeline_vertex_binding { + uint32_t stride; + bool instanced; + uint32_t instance_divisor; + } vb[MAX_VBS]; + + /* Pre computed CS instructions that can directly be copied into + * anv_cmd_buffer. + */ + uint32_t batch_data[512]; + + /* Pre packed CS instructions & structures that need to be merged later + * with dynamic state. + */ + struct { + uint32_t sf[7]; + uint32_t clip[4]; + uint32_t xfb_bo_pitch[4]; + uint32_t wm[3]; + uint32_t blend_state[MAX_RTS * 2]; + uint32_t streamout_state[3]; + } gfx7; + + struct { + uint32_t sf[4]; + uint32_t raster[5]; + uint32_t wm[2]; + uint32_t ps_blend[2]; + uint32_t blend_state[1 + MAX_RTS * 2]; + uint32_t streamout_state[5]; + } gfx8; +}; + +struct anv_compute_pipeline { + struct anv_pipeline base; + + struct anv_shader_bin * cs; + uint32_t batch_data[9]; + uint32_t interface_descriptor_data[8]; +}; + +struct anv_rt_shader_group { + VkRayTracingShaderGroupTypeKHR type; + + struct anv_shader_bin *general; + struct anv_shader_bin *closest_hit; + struct anv_shader_bin *any_hit; + struct anv_shader_bin *intersection; + + /* VK_KHR_ray_tracing requires shaderGroupHandleSize == 32 */ + uint32_t handle[8]; +}; + +struct anv_ray_tracing_pipeline { + struct anv_pipeline base; + + /* All shaders in the pipeline */ + struct util_dynarray shaders; + + uint32_t group_count; + struct anv_rt_shader_group * groups; + + /* If non-zero, this is the default computed stack size as per the stack + * size computation in the Vulkan spec. If zero, that indicates that the + * client has requested a dynamic stack size. 
+ */ + uint32_t stack_size; +}; + +#define ANV_DECL_PIPELINE_DOWNCAST(pipe_type, pipe_enum) \ + static inline struct anv_##pipe_type##_pipeline * \ + anv_pipeline_to_##pipe_type(struct anv_pipeline *pipeline) \ + { \ + assert(pipeline->type == pipe_enum); \ + return (struct anv_##pipe_type##_pipeline *) pipeline; \ + } + +ANV_DECL_PIPELINE_DOWNCAST(graphics, ANV_PIPELINE_GRAPHICS) +ANV_DECL_PIPELINE_DOWNCAST(compute, ANV_PIPELINE_COMPUTE) +ANV_DECL_PIPELINE_DOWNCAST(ray_tracing, ANV_PIPELINE_RAY_TRACING) + +static inline bool +anv_pipeline_has_stage(const struct anv_graphics_pipeline *pipeline, + gl_shader_stage stage) +{ + return (pipeline->active_stages & mesa_to_vk_shader_stage(stage)) != 0; +} + +static inline bool +anv_pipeline_is_primitive(const struct anv_graphics_pipeline *pipeline) +{ + return anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX); +} + +static inline bool +anv_pipeline_is_mesh(const struct anv_graphics_pipeline *pipeline) +{ + return anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH); +} + +static inline bool +anv_cmd_buffer_all_color_write_masked(const struct anv_cmd_buffer *cmd_buffer) +{ + const struct anv_cmd_graphics_state *state = &cmd_buffer->state.gfx; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint8_t color_writes = dyn->cb.color_write_enables; + + /* All writes disabled through vkCmdSetColorWriteEnableEXT */ + if ((color_writes & ((1u << state->color_att_count) - 1)) == 0) + return true; + + /* Or all write masks are empty */ + for (uint32_t i = 0; i < state->color_att_count; i++) { + if (state->pipeline->color_comp_writes[i] != 0) + return false; + } + + return true; +} + +#define ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(prefix, stage) \ +static inline const struct brw_##prefix##_prog_data * \ +get_##prefix##_prog_data(const struct anv_graphics_pipeline *pipeline) \ +{ \ + if (anv_pipeline_has_stage(pipeline, stage)) { \ + return (const struct brw_##prefix##_prog_data *) \ + pipeline->shaders[stage]->prog_data; \ + } else { \ + return NULL; \ + } \ +} + +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(vs, MESA_SHADER_VERTEX) +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(tcs, MESA_SHADER_TESS_CTRL) +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(tes, MESA_SHADER_TESS_EVAL) +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(gs, MESA_SHADER_GEOMETRY) +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(wm, MESA_SHADER_FRAGMENT) +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(mesh, MESA_SHADER_MESH) +ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(task, MESA_SHADER_TASK) + +static inline const struct brw_cs_prog_data * +get_cs_prog_data(const struct anv_compute_pipeline *pipeline) +{ + assert(pipeline->cs); + return (const struct brw_cs_prog_data *) pipeline->cs->prog_data; +} + +static inline const struct brw_vue_prog_data * +anv_pipeline_get_last_vue_prog_data(const struct anv_graphics_pipeline *pipeline) +{ + if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) + return &get_gs_prog_data(pipeline)->base; + else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) + return &get_tes_prog_data(pipeline)->base; + else + return &get_vs_prog_data(pipeline)->base; +} + +VkResult +anv_device_init_rt_shaders(struct anv_device *device); + +void +anv_device_finish_rt_shaders(struct anv_device *device); + +VkResult +anv_pipeline_init(struct anv_pipeline *pipeline, + struct anv_device *device, + enum anv_pipeline_type type, + VkPipelineCreateFlags flags, + const VkAllocationCallbacks *pAllocator); + +void +anv_pipeline_finish(struct anv_pipeline *pipeline, + struct anv_device *device, 
+ const VkAllocationCallbacks *pAllocator); + +struct anv_format_plane { + enum isl_format isl_format:16; + struct isl_swizzle swizzle; + + /* Whether this plane contains chroma channels */ + bool has_chroma; + + /* For downscaling of YUV planes */ + uint8_t denominator_scales[2]; + + /* How to map sampled ycbcr planes to a single 4 component element. */ + struct isl_swizzle ycbcr_swizzle; + + /* What aspect is associated to this plane */ + VkImageAspectFlags aspect; +}; + + +struct anv_format { + struct anv_format_plane planes[3]; + VkFormat vk_format; + uint8_t n_planes; + bool can_ycbcr; +}; + +static inline void +anv_assert_valid_aspect_set(VkImageAspectFlags aspects) +{ + if (util_bitcount(aspects) == 1) { + assert(aspects & (VK_IMAGE_ASPECT_COLOR_BIT | + VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT | + VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT)); + } else if (aspects & VK_IMAGE_ASPECT_PLANES_BITS_ANV) { + assert(aspects == VK_IMAGE_ASPECT_PLANE_0_BIT || + aspects == (VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT) || + aspects == (VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT)); + } else { + assert(aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | + VK_IMAGE_ASPECT_STENCIL_BIT)); + } +} + +/** + * Return the aspect's plane relative to all_aspects. For an image, for + * instance, all_aspects would be the set of aspects in the image. For + * an image view, all_aspects would be the subset of aspects represented + * by that particular view. + */ +static inline uint32_t +anv_aspect_to_plane(VkImageAspectFlags all_aspects, + VkImageAspectFlagBits aspect) +{ + anv_assert_valid_aspect_set(all_aspects); + assert(util_bitcount(aspect) == 1); + assert(!(aspect & ~all_aspects)); + + /* Because we always put image and view planes in aspect-bit-order, the + * plane index is the number of bits in all_aspects before aspect. + */ + return util_bitcount(all_aspects & (aspect - 1)); +} + +#define anv_foreach_image_aspect_bit(b, image, aspects) \ + u_foreach_bit(b, vk_image_expand_aspect_mask(&(image)->vk, aspects)) + +const struct anv_format * +anv_get_format(VkFormat format); + +static inline uint32_t +anv_get_format_planes(VkFormat vk_format) +{ + const struct anv_format *format = anv_get_format(vk_format); + + return format != NULL ? format->n_planes : 0; +} + +struct anv_format_plane +anv_get_format_plane(const struct intel_device_info *devinfo, + VkFormat vk_format, uint32_t plane, + VkImageTiling tiling); + +struct anv_format_plane +anv_get_format_aspect(const struct intel_device_info *devinfo, + VkFormat vk_format, + VkImageAspectFlagBits aspect, VkImageTiling tiling); + +static inline enum isl_format +anv_get_isl_format(const struct intel_device_info *devinfo, VkFormat vk_format, + VkImageAspectFlags aspect, VkImageTiling tiling) +{ + return anv_get_format_aspect(devinfo, vk_format, aspect, tiling).isl_format; +} + +bool anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo, + VkImageCreateFlags create_flags, + VkFormat vk_format, VkImageTiling vk_tiling, + VkImageUsageFlags vk_usage, + const VkImageFormatListCreateInfo *fmt_list); + +extern VkFormat +vk_format_from_android(unsigned android_format, unsigned android_usage); + +static inline struct isl_swizzle +anv_swizzle_for_render(struct isl_swizzle swizzle) +{ + /* Sometimes the swizzle will have alpha map to one. 
We do this to fake
+ * RGB as RGBA for texturing
+ */
+ assert(swizzle.a == ISL_CHANNEL_SELECT_ONE ||
+ swizzle.a == ISL_CHANNEL_SELECT_ALPHA);
+
+ /* But it doesn't matter what we render to that channel */
+ swizzle.a = ISL_CHANNEL_SELECT_ALPHA;
+
+ return swizzle;
+}
+
+void
+anv_pipeline_setup_l3_config(struct anv_pipeline *pipeline, bool needs_slm);
+
+/**
+ * Describes how each part of anv_image will be bound to memory.
+ */
+struct anv_image_memory_range {
+ /**
+ * Disjoint bindings into which each portion of the image will be bound.
+ *
+ * Binding images to memory can be complicated and involve binding different
+ * portions of the image to different memory objects or regions. For most
+ * images, everything lives in the MAIN binding and gets bound by
+ * vkBindImageMemory. For disjoint multi-planar images, each plane has
+ * a unique, disjoint binding and gets bound by vkBindImageMemory2 with
+ * VkBindImagePlaneMemoryInfo. There may also exist bits of memory which are
+ * implicit or driver-managed and live in special-case bindings.
+ */
+ enum anv_image_memory_binding {
+ /**
+ * Used if and only if image is not multi-planar disjoint. Bound by
+ * vkBindImageMemory2 without VkBindImagePlaneMemoryInfo.
+ */
+ ANV_IMAGE_MEMORY_BINDING_MAIN,
+
+ /**
+ * Used if and only if image is multi-planar disjoint. Bound by
+ * vkBindImageMemory2 with VkBindImagePlaneMemoryInfo.
+ */
+ ANV_IMAGE_MEMORY_BINDING_PLANE_0,
+ ANV_IMAGE_MEMORY_BINDING_PLANE_1,
+ ANV_IMAGE_MEMORY_BINDING_PLANE_2,
+
+ /**
+ * Driver-private bo. In special cases we may store the aux surface and/or
+ * aux state in this binding.
+ */
+ ANV_IMAGE_MEMORY_BINDING_PRIVATE,
+
+ /** Sentinel */
+ ANV_IMAGE_MEMORY_BINDING_END,
+ } binding;
+
+ /**
+ * Offset is relative to the start of the binding created by
+ * vkBindImageMemory, not to the start of the bo.
+ */
+ uint64_t offset;
+
+ uint64_t size;
+ uint32_t alignment;
+};
+
+/**
+ * Subsurface of an anv_image.
+ */
+struct anv_surface {
+ struct isl_surf isl;
+ struct anv_image_memory_range memory_range;
+};
+
+static inline bool MUST_CHECK
+anv_surface_is_valid(const struct anv_surface *surface)
+{
+ return surface->isl.size_B > 0 && surface->memory_range.size > 0;
+}
+
+struct anv_image {
+ struct vk_image vk;
+
+ uint32_t n_planes;
+
+ /**
+ * Image has multi-planar format and was created with
+ * VK_IMAGE_CREATE_DISJOINT_BIT.
+ */
+ bool disjoint;
+
+ /**
+ * Image was imported from a struct AHardwareBuffer. We have to delay
+ * final image creation until bind time.
+ */
+ bool from_ahb;
+
+ /**
+ * Image was imported from gralloc with VkNativeBufferANDROID. The gralloc bo
+ * must be released when the image is destroyed.
+ */
+ bool from_gralloc;
+
+ /**
+ * The memory bindings created by vkCreateImage and vkBindImageMemory.
+ *
+ * For details on the image's memory layout, see check_memory_bindings().
+ *
+ * vkCreateImage constructs the `memory_range` for each
+ * anv_image_memory_binding. After vkCreateImage, each binding is valid if
+ * and only if `memory_range::size > 0`.
+ *
+ * vkBindImageMemory binds each valid `memory_range` to an `address`.
+ * Usually, the app will provide the address via the parameters of
+ * vkBindImageMemory. However, special-case bindings may be bound to
+ * driver-private memory.
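For reference, a minimal application-side sketch (core Vulkan 1.1; the handles are hypothetical parameters) of how the planes of a disjoint image end up in the PLANE_0/PLANE_1 bindings described above:

#include <vulkan/vulkan.h>

/* Hypothetical sketch: bind two planes of a disjoint multi-planar image.
 * Each plane lands in the matching ANV_IMAGE_MEMORY_BINDING_PLANE_n. */
static void
example_bind_disjoint_planes(VkDevice device, VkImage image,
                             VkDeviceMemory plane0_mem,
                             VkDeviceMemory plane1_mem)
{
   const VkBindImagePlaneMemoryInfo plane0_info = {
      .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
      .planeAspect = VK_IMAGE_ASPECT_PLANE_0_BIT,
   };
   const VkBindImagePlaneMemoryInfo plane1_info = {
      .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
      .planeAspect = VK_IMAGE_ASPECT_PLANE_1_BIT,
   };
   const VkBindImageMemoryInfo bind_infos[2] = {
      {
         .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
         .pNext = &plane0_info,
         .image = image,
         .memory = plane0_mem,
         .memoryOffset = 0,
      },
      {
         .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
         .pNext = &plane1_info,
         .image = image,
         .memory = plane1_mem,
         .memoryOffset = 0,
      },
   };
   vkBindImageMemory2(device, 2, bind_infos);
}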
+ */
+ struct anv_image_binding {
+ struct anv_image_memory_range memory_range;
+ struct anv_address address;
+ } bindings[ANV_IMAGE_MEMORY_BINDING_END];
+
+ /**
+ * Image subsurfaces
+ *
+ * For each plane x, anv_image::planes[x].surface is valid if and only if
+ * anv_image::aspects has an x aspect. Refer to anv_image_aspect_to_plane()
+ * to figure out the plane number associated with a given aspect.
+ *
+ * The hardware requires that the depth buffer and stencil buffer be
+ * separate surfaces. From Vulkan's perspective, though, depth and stencil
+ * reside in the same VkImage. To satisfy both the hardware and Vulkan, we
+ * allocate the depth and stencil buffers as separate surfaces in the same
+ * bo.
+ */
+ struct anv_image_plane {
+ struct anv_surface primary_surface;
+
+ /**
+ * A surface which shadows the main surface and may have different
+ * tiling. This is used for sampling using a tiling that isn't supported
+ * for other operations.
+ */
+ struct anv_surface shadow_surface;
+
+ /**
+ * The base aux usage for this image. For color images, this can be
+ * either CCS_E or CCS_D depending on whether or not we can reliably
+ * leave CCS on all the time.
+ */
+ enum isl_aux_usage aux_usage;
+
+ struct anv_surface aux_surface;
+
+ /** Location of the fast clear state. */
+ struct anv_image_memory_range fast_clear_memory_range;
+
+ /**
+ * Whether this image can be fast cleared with non-zero clear colors.
+ * This can happen with mutable images when formats of different bit
+ * sizes per component are used.
+ *
+ * On Gfx9+, because the clear colors are stored as four 32-bit component
+ * values, we can clear in R16G16_UNORM (store two 16-bit values in
+ * components 0 & 1 of the clear color) and then draw in R32_UINT which
+ * would interpret the clear color as a single component value, using
+ * only the first 16-bit component of the previously written clear color.
+ *
+ * On Gfx7/7.5/8, only CC_ZERO/CC_ONE clear colors are supported, so this
+ * boolean will prevent the usage of CC_ONE.
+ */
+ bool can_non_zero_fast_clear;
+ } planes[3];
+};
+
+static inline bool
+anv_image_is_externally_shared(const struct anv_image *image)
+{
+ return image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID ||
+ image->vk.external_handle_types != 0;
+}
+
+static inline bool
+anv_image_has_private_binding(const struct anv_image *image)
+{
+ const struct anv_image_binding private_binding =
+ image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE];
+ return private_binding.memory_range.size != 0;
+}
+
+/* The ordering of this enum is important */
+enum anv_fast_clear_type {
+ /** Image does not have/support any fast-clear blocks */
+ ANV_FAST_CLEAR_NONE = 0,
+ /** Image has/supports fast-clear but only to the default value */
+ ANV_FAST_CLEAR_DEFAULT_VALUE = 1,
+ /** Image has/supports fast-clear with an arbitrary fast-clear value */
+ ANV_FAST_CLEAR_ANY = 2,
+};
+
+/**
+ * Return the aspect's _format_ plane, not its _memory_ plane (using the
+ * vocabulary of VK_EXT_image_drm_format_modifier). As a consequence, \a
+ * aspect_mask may contain VK_IMAGE_ASPECT_PLANE_*, but must not contain
+ * VK_IMAGE_ASPECT_MEMORY_PLANE_* .
+ */
+static inline uint32_t
+anv_image_aspect_to_plane(const struct anv_image *image,
+ VkImageAspectFlagBits aspect)
+{
+ return anv_aspect_to_plane(image->vk.aspects, aspect);
+}
+
+/* Returns the number of auxiliary buffer levels attached to an image.
*/ +static inline uint8_t +anv_image_aux_levels(const struct anv_image * const image, + VkImageAspectFlagBits aspect) +{ + uint32_t plane = anv_image_aspect_to_plane(image, aspect); + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + return 0; + + return image->vk.mip_levels; +} + +/* Returns the number of auxiliary buffer layers attached to an image. */ +static inline uint32_t +anv_image_aux_layers(const struct anv_image * const image, + VkImageAspectFlagBits aspect, + const uint8_t miplevel) +{ + assert(image); + + /* The miplevel must exist in the main buffer. */ + assert(miplevel < image->vk.mip_levels); + + if (miplevel >= anv_image_aux_levels(image, aspect)) { + /* There are no layers with auxiliary data because the miplevel has no + * auxiliary data. + */ + return 0; + } + + return MAX2(image->vk.array_layers, image->vk.extent.depth >> miplevel); +} + +static inline struct anv_address MUST_CHECK +anv_image_address(const struct anv_image *image, + const struct anv_image_memory_range *mem_range) +{ + const struct anv_image_binding *binding = &image->bindings[mem_range->binding]; + assert(binding->memory_range.offset == 0); + + if (mem_range->size == 0) + return ANV_NULL_ADDRESS; + + return anv_address_add(binding->address, mem_range->offset); +} + +static inline struct anv_address +anv_image_get_clear_color_addr(UNUSED const struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlagBits aspect) +{ + assert(image->vk.aspects & (VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV | + VK_IMAGE_ASPECT_DEPTH_BIT)); + + uint32_t plane = anv_image_aspect_to_plane(image, aspect); + const struct anv_image_memory_range *mem_range = + &image->planes[plane].fast_clear_memory_range; + + return anv_image_address(image, mem_range); +} + +static inline struct anv_address +anv_image_get_fast_clear_type_addr(const struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlagBits aspect) +{ + struct anv_address addr = + anv_image_get_clear_color_addr(device, image, aspect); + + const unsigned clear_color_state_size = device->info->ver >= 10 ? + device->isl_dev.ss.clear_color_state_size : + device->isl_dev.ss.clear_value_size; + return anv_address_add(addr, clear_color_state_size); +} + +static inline struct anv_address +anv_image_get_compression_state_addr(const struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t level, uint32_t array_layer) +{ + assert(level < anv_image_aux_levels(image, aspect)); + assert(array_layer < anv_image_aux_layers(image, aspect, level)); + UNUSED uint32_t plane = anv_image_aspect_to_plane(image, aspect); + assert(image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E); + + /* Relative to start of the plane's fast clear memory range */ + uint32_t offset; + + offset = 4; /* Go past the fast clear type */ + + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + for (uint32_t l = 0; l < level; l++) + offset += anv_minify(image->vk.extent.depth, l) * 4; + } else { + offset += level * image->vk.array_layers * 4; + } + + offset += array_layer * 4; + + assert(offset < image->planes[plane].fast_clear_memory_range.size); + + return anv_address_add( + anv_image_get_fast_clear_type_addr(device, image, aspect), + offset); +} + +/* Returns true if a HiZ-enabled depth buffer can be sampled from. 
*/ +static inline bool +anv_can_sample_with_hiz(const struct intel_device_info * const devinfo, + const struct anv_image *image) +{ + if (!(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) + return false; + + /* For Gfx8-11, there are some restrictions around sampling from HiZ. + * The Skylake PRM docs for RENDER_SURFACE_STATE::AuxiliarySurfaceMode + * say: + * + * "If this field is set to AUX_HIZ, Number of Multisamples must + * be MULTISAMPLECOUNT_1, and Surface Type cannot be SURFTYPE_3D." + */ + if (image->vk.image_type == VK_IMAGE_TYPE_3D) + return false; + + /* Allow this feature on BDW even though it is disabled in the BDW devinfo + * struct. There's documentation which suggests that this feature actually + * reduces performance on BDW, but it has only been observed to help so + * far. Sampling fast-cleared blocks on BDW must also be handled with care + * (see depth_stencil_attachment_compute_aux_usage() for more info). + */ + if (devinfo->ver != 8 && !devinfo->has_sample_with_hiz) + return false; + + return image->vk.samples == 1; +} + +/* Returns true if an MCS-enabled buffer can be sampled from. */ +static inline bool +anv_can_sample_mcs_with_clear(const struct intel_device_info * const devinfo, + const struct anv_image *image) +{ + assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_COLOR_BIT); + + assert(isl_aux_usage_has_mcs(image->planes[plane].aux_usage)); + + const struct anv_surface *anv_surf = &image->planes[plane].primary_surface; + + /* On TGL, the sampler has an issue with some 8 and 16bpp MSAA fast clears. + * See HSD 1707282275, wa_14013111325. Due to the use of + * format-reinterpretation, a simplified workaround is implemented. + */ + if (devinfo->ver >= 12 && + isl_format_get_layout(anv_surf->isl.format)->bpb <= 16) { + return false; + } + + return true; +} + +static inline bool +anv_image_plane_uses_aux_map(const struct anv_device *device, + const struct anv_image *image, + uint32_t plane) +{ + return device->info->has_aux_map && + isl_aux_usage_has_ccs(image->planes[plane].aux_usage); +} + +void +anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + uint32_t level, + uint32_t base_layer, + uint32_t layer_count); + +void +anv_image_clear_color(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + enum isl_format format, struct isl_swizzle swizzle, + uint32_t level, uint32_t base_layer, uint32_t layer_count, + VkRect2D area, union isl_color_value clear_color); +void +anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlags aspects, + enum isl_aux_usage depth_aux_usage, + uint32_t level, + uint32_t base_layer, uint32_t layer_count, + VkRect2D area, + float depth_value, uint8_t stencil_value); +void +anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *src_image, + enum isl_aux_usage src_aux_usage, + uint32_t src_level, uint32_t src_base_layer, + const struct anv_image *dst_image, + enum isl_aux_usage dst_aux_usage, + uint32_t dst_level, uint32_t dst_base_layer, + VkImageAspectFlagBits aspect, + uint32_t src_x, uint32_t src_y, + uint32_t dst_x, uint32_t dst_y, + uint32_t width, uint32_t height, + uint32_t layer_count, + enum blorp_filter filter); +void +anv_image_hiz_op(struct anv_cmd_buffer *cmd_buffer, + 
const struct anv_image *image,
+ VkImageAspectFlagBits aspect, uint32_t level,
+ uint32_t base_layer, uint32_t layer_count,
+ enum isl_aux_op hiz_op);
+void
+anv_image_hiz_clear(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ VkImageAspectFlags aspects,
+ uint32_t level,
+ uint32_t base_layer, uint32_t layer_count,
+ VkRect2D area, uint8_t stencil_value);
+void
+anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ enum isl_format format, struct isl_swizzle swizzle,
+ VkImageAspectFlagBits aspect,
+ uint32_t base_layer, uint32_t layer_count,
+ enum isl_aux_op mcs_op, union isl_color_value *clear_value,
+ bool predicate);
+void
+anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ enum isl_format format, struct isl_swizzle swizzle,
+ VkImageAspectFlagBits aspect, uint32_t level,
+ uint32_t base_layer, uint32_t layer_count,
+ enum isl_aux_op ccs_op, union isl_color_value *clear_value,
+ bool predicate);
+
+void
+anv_image_copy_to_shadow(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ VkImageAspectFlagBits aspect,
+ uint32_t base_level, uint32_t level_count,
+ uint32_t base_layer, uint32_t layer_count);
+
+enum isl_aux_state ATTRIBUTE_PURE
+anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
+ const struct anv_image *image,
+ const VkImageAspectFlagBits aspect,
+ const VkImageLayout layout);
+
+enum isl_aux_usage ATTRIBUTE_PURE
+anv_layout_to_aux_usage(const struct intel_device_info * const devinfo,
+ const struct anv_image *image,
+ const VkImageAspectFlagBits aspect,
+ const VkImageUsageFlagBits usage,
+ const VkImageLayout layout);
+
+enum anv_fast_clear_type ATTRIBUTE_PURE
+anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo,
+ const struct anv_image * const image,
+ const VkImageAspectFlagBits aspect,
+ const VkImageLayout layout);
+
+static inline bool
+anv_image_aspects_compatible(VkImageAspectFlags aspects1,
+ VkImageAspectFlags aspects2)
+{
+ if (aspects1 == aspects2)
+ return true;
+
+ /* Only color aspects with the same number of planes are compatible. */
+ if ((aspects1 & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) != 0 &&
+ (aspects2 & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) != 0 &&
+ util_bitcount(aspects1) == util_bitcount(aspects2))
+ return true;
+
+ return false;
+}
+
+struct anv_image_view {
+ struct vk_image_view vk;
+
+ const struct anv_image *image; /**< VkImageViewCreateInfo::image */
+
+ unsigned n_planes;
+ struct {
+ uint32_t image_plane;
+
+ struct isl_view isl;
+
+ /**
+ * RENDER_SURFACE_STATE when using image as a sampler surface with an
+ * image layout of SHADER_READ_ONLY_OPTIMAL or
+ * DEPTH_STENCIL_READ_ONLY_OPTIMAL.
+ */
+ struct anv_surface_state optimal_sampler_surface_state;
+
+ /**
+ * RENDER_SURFACE_STATE when using image as a sampler surface with an
+ * image layout of GENERAL.
+ */
+ struct anv_surface_state general_sampler_surface_state;
+
+ /**
+ * RENDER_SURFACE_STATE when using image as a storage image. Separate
+ * states for vanilla (with the original format) and one which has been
+ * lowered to a format suitable for reading. This may be a raw surface
+ * in extreme cases or simply a surface with a different format where we
+ * expect some conversion to be done in the shader.
+ */ + struct anv_surface_state storage_surface_state; + struct anv_surface_state lowered_storage_surface_state; + + struct brw_image_param lowered_storage_image_param; + } planes[3]; +}; + +enum anv_image_view_state_flags { + ANV_IMAGE_VIEW_STATE_STORAGE_LOWERED = (1 << 0), + ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL = (1 << 1), +}; + +void anv_image_fill_surface_state(struct anv_device *device, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + const struct isl_view *view, + isl_surf_usage_flags_t view_usage, + enum isl_aux_usage aux_usage, + const union isl_color_value *clear_color, + enum anv_image_view_state_flags flags, + struct anv_surface_state *state_inout, + struct brw_image_param *image_param_out); + +struct anv_image_create_info { + const VkImageCreateInfo *vk_info; + + /** An opt-in bitmask which filters an ISL-mapping of the Vulkan tiling. */ + isl_tiling_flags_t isl_tiling_flags; + + /** These flags will be added to any derived from VkImageCreateInfo. */ + isl_surf_usage_flags_t isl_extra_usage_flags; +}; + +VkResult anv_image_init(struct anv_device *device, struct anv_image *image, + const struct anv_image_create_info *create_info); + +void anv_image_finish(struct anv_image *image); + +void anv_image_get_memory_requirements(struct anv_device *device, + struct anv_image *image, + VkImageAspectFlags aspects, + VkMemoryRequirements2 *pMemoryRequirements); + +enum isl_format +anv_isl_format_for_descriptor_type(const struct anv_device *device, + VkDescriptorType type); + +static inline uint32_t +anv_rasterization_aa_mode(VkPolygonMode raster_mode, + VkLineRasterizationModeEXT line_mode) +{ + if (raster_mode == VK_POLYGON_MODE_LINE && + line_mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT) + return true; + return false; +} + +VkFormatFeatureFlags2 +anv_get_image_format_features2(const struct intel_device_info *devinfo, + VkFormat vk_format, + const struct anv_format *anv_format, + VkImageTiling vk_tiling, + const struct isl_drm_modifier_info *isl_mod_info); + +void anv_fill_buffer_surface_state(struct anv_device *device, + struct anv_state state, + enum isl_format format, + struct isl_swizzle swizzle, + isl_surf_usage_flags_t usage, + struct anv_address address, + uint32_t range, uint32_t stride); + + +/* Haswell border color is a bit of a disaster. Float and unorm formats use a + * straightforward 32-bit float color in the first 64 bytes. Instead of using + * a nice float/integer union like Gfx8+, Haswell specifies the integer border + * color as a separate entry /after/ the float color. The layout of this entry + * also depends on the format's bpp (with extra hacks for RG32), and overlaps. + * + * Since we don't know the format/bpp, we can't make any of the border colors + * containing '1' work for all formats, as it would be in the wrong place for + * some of them. We opt to make 32-bit integers work as this seems like the + * most common option. Fortunately, transparent black works regardless, as + * all zeroes is the same in every bit-size. 
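The padding in the structs below is what pins the layout: 16 bytes of float color plus 48 bytes of padding put the Haswell integer entry at byte offset 64, and the trailing padding brings the whole entry to 512 bytes (the Gfx8+ variant is padded to 64 bytes). A small, hypothetical sketch of compile-time checks one could place after the definitions, assuming C11 static_assert:

#include <assert.h>   /* static_assert (C11) */
#include <stddef.h>   /* offsetof */

/* Hypothetical layout checks, assuming the border color structs below are
 * in scope. */
static_assert(offsetof(struct hsw_border_color, uint32) == 64,
              "integer border color must start at byte 64");
static_assert(sizeof(struct hsw_border_color) == 512,
              "Haswell border color entry is 512 bytes");
static_assert(sizeof(struct gfx8_border_color) == 64,
              "Gfx8+ border color entry is 64 bytes");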
+ */
+struct hsw_border_color {
+ float float32[4];
+ uint32_t _pad0[12];
+ uint32_t uint32[4];
+ uint32_t _pad1[108];
+};
+
+struct gfx8_border_color {
+ union {
+ float float32[4];
+ uint32_t uint32[4];
+ };
+ /* Pad out to 64 bytes */
+ uint32_t _pad[12];
+};
+
+struct anv_ycbcr_conversion {
+ struct vk_object_base base;
+
+ const struct anv_format * format;
+ VkSamplerYcbcrModelConversion ycbcr_model;
+ VkSamplerYcbcrRange ycbcr_range;
+ VkComponentSwizzle mapping[4];
+ VkChromaLocation chroma_offsets[2];
+ VkFilter chroma_filter;
+ bool chroma_reconstruction;
+};
+
+struct anv_sampler {
+ struct vk_object_base base;
+
+ uint32_t state[3][4];
+ uint32_t n_planes;
+ struct anv_ycbcr_conversion *conversion;
+
+ /* Blob of sampler state data which is guaranteed to be 32-byte aligned
+ * and with a 32-byte stride for use as bindless samplers.
+ */
+ struct anv_state bindless_state;
+
+ struct anv_state custom_border_color;
+};
+
+#define ANV_PIPELINE_STATISTICS_MASK 0x000007ff
+
+struct anv_query_pool {
+ struct vk_object_base base;
+
+ VkQueryType type;
+ VkQueryPipelineStatisticFlags pipeline_statistics;
+ /** Stride between slots, in bytes */
+ uint32_t stride;
+ /** Number of slots in this query pool */
+ uint32_t slots;
+ struct anv_bo * bo;
+
+ /* KHR perf queries: */
+ uint32_t pass_size;
+ uint32_t data_offset;
+ uint32_t snapshot_size;
+ uint32_t n_counters;
+ struct intel_perf_counter_pass *counter_pass;
+ uint32_t n_passes;
+ struct intel_perf_query_info **pass_query;
+};
+
+static inline uint32_t khr_perf_query_preamble_offset(const struct anv_query_pool *pool,
+ uint32_t pass)
+{
+ return pool->pass_size * pass + 8;
+}
+
+struct anv_acceleration_structure {
+ struct vk_object_base base;
+
+ VkDeviceSize size;
+ struct anv_address address;
+};
+
+void
+anv_dump_pipe_bits(enum anv_pipe_bits bits);
+
+static inline void
+anv_add_pending_pipe_bits(struct anv_cmd_buffer* cmd_buffer,
+ enum anv_pipe_bits bits,
+ const char* reason)
+{
+ cmd_buffer->state.pending_pipe_bits |= bits;
+ if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) && bits)
+ {
+ fputs("pc: add ", stderr);
+ anv_dump_pipe_bits(bits);
+ fprintf(stderr, "reason: %s\n", reason);
+ }
+}
+
+struct anv_performance_configuration_intel {
+ struct vk_object_base base;
+
+ struct intel_perf_registers *register_config;
+
+ uint64_t config_id;
+};
+
+void anv_physical_device_init_perf(struct anv_physical_device *device, int fd);
+void anv_device_perf_init(struct anv_device *device);
+void anv_perf_write_pass_results(struct intel_perf_config *perf,
+ struct anv_query_pool *pool, uint32_t pass,
+ const struct intel_perf_query_result *accumulated_results,
+ union VkPerformanceCounterResultKHR *results);
+
+/* Used to emit a series of memcpy operations */
+struct anv_memcpy_state {
+ struct anv_device *device;
+ struct anv_batch *batch;
+
+ struct anv_vb_cache_range vb_bound;
+ struct anv_vb_cache_range vb_dirty;
+};
+
+struct anv_utrace_flush_copy {
+ /* Needs to be the first field */
+ struct intel_ds_flush_data ds;
+
+ /* Batch stuff to implement a copy of timestamps recorded in another
+ * buffer.
+ */ + struct anv_reloc_list relocs; + struct anv_batch batch; + struct anv_bo *batch_bo; + + /* Buffer of 64bits timestamps */ + struct anv_bo *trace_bo; + + /* Syncobj to be signaled when the batch completes */ + struct vk_sync *sync; + + /* Queue on which all the recorded traces are submitted */ + struct anv_queue *queue; + + struct anv_memcpy_state memcpy_state; +}; + +void anv_device_utrace_init(struct anv_device *device); +void anv_device_utrace_finish(struct anv_device *device); +VkResult +anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + struct anv_utrace_flush_copy **out_flush_data); + +#ifdef HAVE_PERFETTO +void anv_perfetto_init(void); +uint64_t anv_perfetto_begin_submit(struct anv_queue *queue); +void anv_perfetto_end_submit(struct anv_queue *queue, uint32_t submission_id, + uint64_t start_ts); +#else +static inline void anv_perfetto_init(void) +{ +} +static inline uint64_t anv_perfetto_begin_submit(struct anv_queue *queue) +{ + return 0; +} +static inline void anv_perfetto_end_submit(struct anv_queue *queue, + uint32_t submission_id, + uint64_t start_ts) +{} +#endif + + +#define ANV_FROM_HANDLE(__anv_type, __name, __handle) \ + VK_FROM_HANDLE(__anv_type, __name, __handle) + +VK_DEFINE_HANDLE_CASTS(anv_cmd_buffer, vk.base, VkCommandBuffer, + VK_OBJECT_TYPE_COMMAND_BUFFER) +VK_DEFINE_HANDLE_CASTS(anv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) +VK_DEFINE_HANDLE_CASTS(anv_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) +VK_DEFINE_HANDLE_CASTS(anv_physical_device, vk.base, VkPhysicalDevice, + VK_OBJECT_TYPE_PHYSICAL_DEVICE) +VK_DEFINE_HANDLE_CASTS(anv_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) + +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_acceleration_structure, base, + VkAccelerationStructureKHR, + VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer, vk.base, VkBuffer, + VK_OBJECT_TYPE_BUFFER) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer_view, base, VkBufferView, + VK_OBJECT_TYPE_BUFFER_VIEW) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_pool, base, VkDescriptorPool, + VK_OBJECT_TYPE_DESCRIPTOR_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_set, base, VkDescriptorSet, + VK_OBJECT_TYPE_DESCRIPTOR_SET) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_set_layout, base, + VkDescriptorSetLayout, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_update_template, base, + VkDescriptorUpdateTemplate, + VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_device_memory, base, VkDeviceMemory, + VK_OBJECT_TYPE_DEVICE_MEMORY) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_image_view, vk.base, VkImageView, + VK_OBJECT_TYPE_IMAGE_VIEW); +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline, base, VkPipeline, + VK_OBJECT_TYPE_PIPELINE) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline_layout, base, VkPipelineLayout, + VK_OBJECT_TYPE_PIPELINE_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_query_pool, base, VkQueryPool, + VK_OBJECT_TYPE_QUERY_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_sampler, base, VkSampler, + VK_OBJECT_TYPE_SAMPLER) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_ycbcr_conversion, base, + VkSamplerYcbcrConversion, + VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION) +VK_DEFINE_NONDISP_HANDLE_CASTS(anv_performance_configuration_intel, base, + VkPerformanceConfigurationINTEL, + 
VK_OBJECT_TYPE_PERFORMANCE_CONFIGURATION_INTEL) + +#define anv_genX(devinfo, thing) ({ \ + __typeof(&gfx9_##thing) genX_thing; \ + switch ((devinfo)->verx10) { \ + case 70: \ + genX_thing = &gfx7_##thing; \ + break; \ + case 75: \ + genX_thing = &gfx75_##thing; \ + break; \ + case 80: \ + genX_thing = &gfx8_##thing; \ + break; \ + case 90: \ + genX_thing = &gfx9_##thing; \ + break; \ + case 110: \ + genX_thing = &gfx11_##thing; \ + break; \ + case 120: \ + genX_thing = &gfx12_##thing; \ + break; \ + case 125: \ + genX_thing = &gfx125_##thing; \ + break; \ + default: \ + unreachable("Unknown hardware generation"); \ + } \ + genX_thing; \ +}) + +/* Gen-specific function declarations */ +#ifdef genX +# include "anv_genX.h" +#else +# define genX(x) gfx7_##x +# include "anv_genX.h" +# undef genX +# define genX(x) gfx75_##x +# include "anv_genX.h" +# undef genX +# define genX(x) gfx8_##x +# include "anv_genX.h" +# undef genX +# define genX(x) gfx9_##x +# include "anv_genX.h" +# undef genX +# define genX(x) gfx11_##x +# include "anv_genX.h" +# undef genX +# define genX(x) gfx12_##x +# include "anv_genX.h" +# undef genX +# define genX(x) gfx125_##x +# include "anv_genX.h" +# undef genX +#endif + +#endif /* ANV_PRIVATE_H */ diff --git a/src/intel/vulkan_hasvk/anv_queue.c b/src/intel/vulkan_hasvk/anv_queue.c new file mode 100644 index 00000000000..2cada846753 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_queue.c @@ -0,0 +1,75 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +/** + * This file implements VkQueue + */ + +#include "anv_private.h" + +VkResult +anv_queue_init(struct anv_device *device, struct anv_queue *queue, + uint32_t exec_flags, + const VkDeviceQueueCreateInfo *pCreateInfo, + uint32_t index_in_family) +{ + struct anv_physical_device *pdevice = device->physical; + VkResult result; + + result = vk_queue_init(&queue->vk, &device->vk, pCreateInfo, + index_in_family); + if (result != VK_SUCCESS) + return result; + + if (INTEL_DEBUG(DEBUG_SYNC)) { + result = vk_sync_create(&device->vk, + &device->physical->sync_syncobj_type, + 0, 0, &queue->sync); + if (result != VK_SUCCESS) { + vk_queue_finish(&queue->vk); + return result; + } + } + + queue->vk.driver_submit = anv_queue_submit; + + queue->device = device; + + assert(queue->vk.queue_family_index < pdevice->queue.family_count); + queue->family = &pdevice->queue.families[queue->vk.queue_family_index]; + + queue->index_in_family = index_in_family; + + queue->exec_flags = exec_flags; + + return VK_SUCCESS; +} + +void +anv_queue_finish(struct anv_queue *queue) +{ + if (queue->sync) + vk_sync_destroy(&queue->device->vk, queue->sync); + + vk_queue_finish(&queue->vk); +} diff --git a/src/intel/vulkan_hasvk/anv_util.c b/src/intel/vulkan_hasvk/anv_util.c new file mode 100644 index 00000000000..988010232fe --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_util.c @@ -0,0 +1,92 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "anv_private.h" +#include "vk_enum_to_str.h" + +void +__anv_perf_warn(struct anv_device *device, + const struct vk_object_base *object, + const char *file, int line, const char *format, ...) 
+{ + va_list ap; + char buffer[256]; + + va_start(ap, format); + vsnprintf(buffer, sizeof(buffer), format, ap); + va_end(ap); + + if (object) { + __vk_log(VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT, + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, + VK_LOG_OBJS(object), file, line, + "PERF: %s", buffer); + } else { + __vk_log(VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT, + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, + VK_LOG_NO_OBJS(device->physical->instance), file, line, + "PERF: %s", buffer); + } +} + +void +anv_dump_pipe_bits(enum anv_pipe_bits bits) +{ + if (bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT) + fputs("+depth_flush ", stderr); + if (bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT) + fputs("+dc_flush ", stderr); + if (bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT) + fputs("+hdc_flush ", stderr); + if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT) + fputs("+rt_flush ", stderr); + if (bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT) + fputs("+tile_flush ", stderr); + if (bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT) + fputs("+state_inval ", stderr); + if (bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT) + fputs("+const_inval ", stderr); + if (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT) + fputs("+vf_inval ", stderr); + if (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT) + fputs("+tex_inval ", stderr); + if (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT) + fputs("+ic_inval ", stderr); + if (bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT) + fputs("+pb_stall ", stderr); + if (bits & ANV_PIPE_PSS_STALL_SYNC_BIT) + fputs("+pss_stall ", stderr); + if (bits & ANV_PIPE_DEPTH_STALL_BIT) + fputs("+depth_stall ", stderr); + if (bits & ANV_PIPE_CS_STALL_BIT) + fputs("+cs_stall ", stderr); + if (bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT) + fputs("+utdp_flush", stderr); +} diff --git a/src/intel/vulkan_hasvk/anv_utrace.c b/src/intel/vulkan_hasvk/anv_utrace.c new file mode 100644 index 00000000000..965be744411 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_utrace.c @@ -0,0 +1,346 @@ +/* + * Copyright © 2021 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "anv_private.h" + +#include "perf/intel_perf.h" + +static uint32_t +command_buffers_count_utraces(struct anv_device *device, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + uint32_t *utrace_copies) +{ + if (!u_trace_context_actively_tracing(&device->ds.trace_context)) + return 0; + + uint32_t utraces = 0; + for (uint32_t i = 0; i < cmd_buffer_count; i++) { + if (u_trace_has_points(&cmd_buffers[i]->trace)) { + utraces++; + if (!(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) + *utrace_copies += list_length(&cmd_buffers[i]->trace.trace_chunks); + } + } + + return utraces; +} + +static void +anv_utrace_delete_flush_data(struct u_trace_context *utctx, + void *flush_data) +{ + struct anv_device *device = + container_of(utctx, struct anv_device, ds.trace_context); + struct anv_utrace_flush_copy *flush = flush_data; + + intel_ds_flush_data_fini(&flush->ds); + + if (flush->trace_bo) { + assert(flush->batch_bo); + anv_reloc_list_finish(&flush->relocs, &device->vk.alloc); + anv_device_release_bo(device, flush->batch_bo); + anv_device_release_bo(device, flush->trace_bo); + } + + vk_sync_destroy(&device->vk, flush->sync); + + vk_free(&device->vk.alloc, flush); +} + +static void +anv_device_utrace_emit_copy_ts_buffer(struct u_trace_context *utctx, + void *cmdstream, + void *ts_from, uint32_t from_offset, + void *ts_to, uint32_t to_offset, + uint32_t count) +{ + struct anv_device *device = + container_of(utctx, struct anv_device, ds.trace_context); + struct anv_utrace_flush_copy *flush = cmdstream; + struct anv_address from_addr = (struct anv_address) { + .bo = ts_from, .offset = from_offset * sizeof(uint64_t) }; + struct anv_address to_addr = (struct anv_address) { + .bo = ts_to, .offset = to_offset * sizeof(uint64_t) }; + + anv_genX(device->info, emit_so_memcpy)(&flush->memcpy_state, + to_addr, from_addr, count * sizeof(uint64_t)); +} + +VkResult +anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, + uint32_t cmd_buffer_count, + struct anv_cmd_buffer **cmd_buffers, + struct anv_utrace_flush_copy **out_flush_data) +{ + struct anv_device *device = queue->device; + uint32_t utrace_copies = 0; + uint32_t utraces = command_buffers_count_utraces(device, + cmd_buffer_count, + cmd_buffers, + &utrace_copies); + if (!utraces) { + *out_flush_data = NULL; + return VK_SUCCESS; + } + + VkResult result; + struct anv_utrace_flush_copy *flush = + vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_flush_copy), + 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!flush) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + intel_ds_flush_data_init(&flush->ds, queue->ds, queue->ds->submission_id); + + result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type, + 0, 0, &flush->sync); + if (result != VK_SUCCESS) + goto error_sync; + + if (utrace_copies > 0) { + result = anv_bo_pool_alloc(&device->utrace_bo_pool, + utrace_copies * 4096, + &flush->trace_bo); + if (result != VK_SUCCESS) + goto error_trace_buf; + + result = anv_bo_pool_alloc(&device->utrace_bo_pool, + /* 128 dwords of setup + 64 dwords per copy */ + align_u32(512 + 64 * utrace_copies, 4096), + &flush->batch_bo); + if (result != VK_SUCCESS) + goto error_batch_buf; + + result = anv_reloc_list_init(&flush->relocs, &device->vk.alloc); + if (result != VK_SUCCESS) + goto error_reloc_list; + + flush->batch.alloc = &device->vk.alloc; + flush->batch.relocs = &flush->relocs; + anv_batch_set_storage(&flush->batch, + (struct anv_address) { .bo = flush->batch_bo, }, + 
flush->batch_bo->map, flush->batch_bo->size); + + /* Emit the copies */ + anv_genX(device->info, emit_so_memcpy_init)(&flush->memcpy_state, + device, + &flush->batch); + for (uint32_t i = 0; i < cmd_buffer_count; i++) { + if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) { + u_trace_flush(&cmd_buffers[i]->trace, flush, false); + } else { + u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace), + u_trace_end_iterator(&cmd_buffers[i]->trace), + &flush->ds.trace, + flush, + anv_device_utrace_emit_copy_ts_buffer); + } + } + anv_genX(device->info, emit_so_memcpy_fini)(&flush->memcpy_state); + + u_trace_flush(&flush->ds.trace, flush, true); + + if (flush->batch.status != VK_SUCCESS) { + result = flush->batch.status; + goto error_batch; + } + } else { + for (uint32_t i = 0; i < cmd_buffer_count; i++) { + assert(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT); + u_trace_flush(&cmd_buffers[i]->trace, flush, i == (cmd_buffer_count - 1)); + } + } + + flush->queue = queue; + + *out_flush_data = flush; + + return VK_SUCCESS; + + error_batch: + anv_reloc_list_finish(&flush->relocs, &device->vk.alloc); + error_reloc_list: + anv_bo_pool_free(&device->utrace_bo_pool, flush->batch_bo); + error_batch_buf: + anv_bo_pool_free(&device->utrace_bo_pool, flush->trace_bo); + error_trace_buf: + vk_sync_destroy(&device->vk, flush->sync); + error_sync: + vk_free(&device->vk.alloc, flush); + return result; +} + +static void * +anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b) +{ + struct anv_device *device = + container_of(utctx, struct anv_device, ds.trace_context); + + struct anv_bo *bo = NULL; + UNUSED VkResult result = + anv_bo_pool_alloc(&device->utrace_bo_pool, + align_u32(size_b, 4096), + &bo); + assert(result == VK_SUCCESS); + + return bo; +} + +static void +anv_utrace_destroy_ts_buffer(struct u_trace_context *utctx, void *timestamps) +{ + struct anv_device *device = + container_of(utctx, struct anv_device, ds.trace_context); + struct anv_bo *bo = timestamps; + + anv_bo_pool_free(&device->utrace_bo_pool, bo); +} + +static void +anv_utrace_record_ts(struct u_trace *ut, void *cs, + void *timestamps, unsigned idx, + bool end_of_pipe) +{ + struct anv_cmd_buffer *cmd_buffer = + container_of(ut, struct anv_cmd_buffer, trace); + struct anv_device *device = cmd_buffer->device; + struct anv_bo *bo = timestamps; + + device->physical->cmd_emit_timestamp(&cmd_buffer->batch, device, + (struct anv_address) { + .bo = bo, + .offset = idx * sizeof(uint64_t) }, + end_of_pipe); +} + +static uint64_t +anv_utrace_read_ts(struct u_trace_context *utctx, + void *timestamps, unsigned idx, void *flush_data) +{ + struct anv_device *device = + container_of(utctx, struct anv_device, ds.trace_context); + struct anv_bo *bo = timestamps; + struct anv_utrace_flush_copy *flush = flush_data; + + /* Only need to stall on results for the first entry: */ + if (idx == 0) { + UNUSED VkResult result = + vk_sync_wait(&device->vk, + flush->sync, + 0, + VK_SYNC_WAIT_COMPLETE, + os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE)); + assert(result == VK_SUCCESS); + } + + uint64_t *ts = bo->map; + + /* Don't translate the no-timestamp marker: */ + if (ts[idx] == U_TRACE_NO_TIMESTAMP) + return U_TRACE_NO_TIMESTAMP; + + return intel_device_info_timebase_scale(device->info, ts[idx]); +} + +static const char * +queue_family_to_name(const struct anv_queue_family *family) +{ + switch (family->engine_class) { + case I915_ENGINE_CLASS_RENDER: + return "render"; + case 
I915_ENGINE_CLASS_COPY: + return "copy"; + case I915_ENGINE_CLASS_VIDEO: + return "video"; + case I915_ENGINE_CLASS_VIDEO_ENHANCE: + return "video-enh"; + default: + return "unknown"; + } +} + +void +anv_device_utrace_init(struct anv_device *device) +{ + anv_bo_pool_init(&device->utrace_bo_pool, device, "utrace"); + intel_ds_device_init(&device->ds, device->info, device->fd, + device->physical->local_minor - 128, + INTEL_DS_API_VULKAN); + u_trace_context_init(&device->ds.trace_context, + &device->ds, + anv_utrace_create_ts_buffer, + anv_utrace_destroy_ts_buffer, + anv_utrace_record_ts, + anv_utrace_read_ts, + anv_utrace_delete_flush_data); + + for (uint32_t q = 0; q < device->queue_count; q++) { + struct anv_queue *queue = &device->queues[q]; + + queue->ds = + intel_ds_device_add_queue(&device->ds, "%s%u", + queue_family_to_name(queue->family), + queue->index_in_family); + } +} + +void +anv_device_utrace_finish(struct anv_device *device) +{ + u_trace_context_process(&device->ds.trace_context, true); + intel_ds_device_fini(&device->ds); + anv_bo_pool_finish(&device->utrace_bo_pool); +} + +enum intel_ds_stall_flag +anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits) +{ + static const struct { + enum anv_pipe_bits anv; + enum intel_ds_stall_flag ds; + } anv_to_ds_flags[] = { + { .anv = ANV_PIPE_DEPTH_CACHE_FLUSH_BIT, .ds = INTEL_DS_DEPTH_CACHE_FLUSH_BIT, }, + { .anv = ANV_PIPE_DATA_CACHE_FLUSH_BIT, .ds = INTEL_DS_DATA_CACHE_FLUSH_BIT, }, + { .anv = ANV_PIPE_TILE_CACHE_FLUSH_BIT, .ds = INTEL_DS_TILE_CACHE_FLUSH_BIT, }, + { .anv = ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, .ds = INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT, }, + { .anv = ANV_PIPE_STATE_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_STATE_CACHE_INVALIDATE_BIT, }, + { .anv = ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_CONST_CACHE_INVALIDATE_BIT, }, + { .anv = ANV_PIPE_VF_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_VF_CACHE_INVALIDATE_BIT, }, + { .anv = ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT, }, + { .anv = ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_INST_CACHE_INVALIDATE_BIT, }, + { .anv = ANV_PIPE_DEPTH_STALL_BIT, .ds = INTEL_DS_DEPTH_STALL_BIT, }, + { .anv = ANV_PIPE_CS_STALL_BIT, .ds = INTEL_DS_CS_STALL_BIT, }, + { .anv = ANV_PIPE_HDC_PIPELINE_FLUSH_BIT, .ds = INTEL_DS_HDC_PIPELINE_FLUSH_BIT, }, + { .anv = ANV_PIPE_STALL_AT_SCOREBOARD_BIT, .ds = INTEL_DS_STALL_AT_SCOREBOARD_BIT, }, + { .anv = ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, .ds = INTEL_DS_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, }, + }; + + enum intel_ds_stall_flag ret = 0; + for (uint32_t i = 0; i < ARRAY_SIZE(anv_to_ds_flags); i++) { + if (anv_to_ds_flags[i].anv & bits) + ret |= anv_to_ds_flags[i].ds; + } + + return ret; +} diff --git a/src/intel/vulkan_hasvk/anv_wsi.c b/src/intel/vulkan_hasvk/anv_wsi.c new file mode 100644 index 00000000000..5e98673e275 --- /dev/null +++ b/src/intel/vulkan_hasvk/anv_wsi.c @@ -0,0 +1,118 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall 
be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" +#include "anv_measure.h" +#include "wsi_common.h" +#include "vk_fence.h" +#include "vk_queue.h" +#include "vk_semaphore.h" +#include "vk_util.h" + +static PFN_vkVoidFunction +anv_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + return vk_instance_get_proc_addr_unchecked(&pdevice->instance->vk, pName); +} + +VkResult +anv_init_wsi(struct anv_physical_device *physical_device) +{ + VkResult result; + + result = wsi_device_init(&physical_device->wsi_device, + anv_physical_device_to_handle(physical_device), + anv_wsi_proc_addr, + &physical_device->instance->vk.alloc, + physical_device->master_fd, + &physical_device->instance->dri_options, + false); + if (result != VK_SUCCESS) + return result; + + physical_device->wsi_device.supports_modifiers = true; + physical_device->wsi_device.signal_semaphore_with_memory = true; + physical_device->wsi_device.signal_fence_with_memory = true; + + physical_device->vk.wsi_device = &physical_device->wsi_device; + + wsi_device_setup_syncobj_fd(&physical_device->wsi_device, + physical_device->local_fd); + + return VK_SUCCESS; +} + +void +anv_finish_wsi(struct anv_physical_device *physical_device) +{ + physical_device->vk.wsi_device = NULL; + wsi_device_finish(&physical_device->wsi_device, + &physical_device->instance->vk.alloc); +} + +VkResult anv_AcquireNextImage2KHR( + VkDevice _device, + const VkAcquireNextImageInfoKHR *pAcquireInfo, + uint32_t *pImageIndex) +{ + VK_FROM_HANDLE(anv_device, device, _device); + + VkResult result = + wsi_common_acquire_next_image2(&device->physical->wsi_device, + _device, pAcquireInfo, pImageIndex); + if (result == VK_SUCCESS) + anv_measure_acquire(device); + + return result; +} + +VkResult anv_QueuePresentKHR( + VkQueue _queue, + const VkPresentInfoKHR* pPresentInfo) +{ + ANV_FROM_HANDLE(anv_queue, queue, _queue); + struct anv_device *device = queue->device; + VkResult result; + + if (device->debug_frame_desc) { + device->debug_frame_desc->frame_id++; + if (device->physical->memory.need_clflush) { + intel_clflush_range(device->debug_frame_desc, + sizeof(*device->debug_frame_desc)); + } + } + + result = vk_queue_wait_before_present(&queue->vk, pPresentInfo); + if (result != VK_SUCCESS) + return result; + + result = wsi_common_queue_present(&device->physical->wsi_device, + anv_device_to_handle(queue->device), + _queue, 0, + pPresentInfo); + + u_trace_context_process(&device->ds.trace_context, true); + + return result; +} diff --git a/src/intel/vulkan_hasvk/genX_blorp_exec.c b/src/intel/vulkan_hasvk/genX_blorp_exec.c new file mode 100644 index 00000000000..40582ab9391 --- /dev/null +++ b/src/intel/vulkan_hasvk/genX_blorp_exec.c @@ -0,0 +1,410 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in 
the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "anv_private.h" +#include "anv_measure.h" + +/* These are defined in anv_private.h and blorp_genX_exec.h */ +#undef __gen_address_type +#undef __gen_user_data +#undef __gen_combine_address + +#include "common/intel_l3_config.h" +#include "blorp/blorp_genX_exec.h" + +#include "ds/intel_tracepoints.h" + +static void blorp_measure_start(struct blorp_batch *_batch, + const struct blorp_params *params) +{ + struct anv_cmd_buffer *cmd_buffer = _batch->driver_batch; + trace_intel_begin_blorp(&cmd_buffer->trace); + anv_measure_snapshot(cmd_buffer, + params->snapshot_type, + NULL, 0); +} + +static void blorp_measure_end(struct blorp_batch *_batch, + const struct blorp_params *params) +{ + struct anv_cmd_buffer *cmd_buffer = _batch->driver_batch; + trace_intel_end_blorp(&cmd_buffer->trace, + params->x1 - params->x0, + params->y1 - params->y0, + params->hiz_op, + params->fast_clear_op, + params->shader_type, + params->shader_pipeline); +} + +static void * +blorp_emit_dwords(struct blorp_batch *batch, unsigned n) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + return anv_batch_emit_dwords(&cmd_buffer->batch, n); +} + +static uint64_t +blorp_emit_reloc(struct blorp_batch *batch, + void *location, struct blorp_address address, uint32_t delta) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + assert(cmd_buffer->batch.start <= location && + location < cmd_buffer->batch.end); + return anv_batch_emit_reloc(&cmd_buffer->batch, location, + address.buffer, address.offset + delta); +} + +static void +blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset, + struct blorp_address address, uint32_t delta) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + VkResult result; + + if (ANV_ALWAYS_SOFTPIN) { + result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc, + address.buffer); + if (unlikely(result != VK_SUCCESS)) + anv_batch_set_error(&cmd_buffer->batch, result); + return; + } + + uint64_t address_u64 = 0; + result = anv_reloc_list_add(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc, + ss_offset, address.buffer, + address.offset + delta, + &address_u64); + if (result != VK_SUCCESS) + anv_batch_set_error(&cmd_buffer->batch, result); + + void *dest = anv_block_pool_map( + &cmd_buffer->device->surface_state_pool.block_pool, ss_offset, 8); + write_reloc(cmd_buffer->device, dest, address_u64, false); +} + +static uint64_t +blorp_get_surface_address(struct blorp_batch *blorp_batch, + struct blorp_address address) +{ + if (ANV_ALWAYS_SOFTPIN) { + struct 
anv_address anv_addr = { + .bo = address.buffer, + .offset = address.offset, + }; + return anv_address_physical(anv_addr); + } else { + /* We'll let blorp_surface_reloc write the address. */ + return 0; + } +} + +#if GFX_VER >= 7 && GFX_VER < 10 +static struct blorp_address +blorp_get_surface_base_address(struct blorp_batch *batch) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + return (struct blorp_address) { + .buffer = cmd_buffer->device->surface_state_pool.block_pool.bo, + .offset = 0, + }; +} +#endif + +static void * +blorp_alloc_dynamic_state(struct blorp_batch *batch, + uint32_t size, + uint32_t alignment, + uint32_t *offset) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, alignment); + + *offset = state.offset; + return state.map; +} + +UNUSED static void * +blorp_alloc_general_state(struct blorp_batch *batch, + uint32_t size, + uint32_t alignment, + uint32_t *offset) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + struct anv_state state = + anv_state_stream_alloc(&cmd_buffer->general_state_stream, size, + alignment); + + *offset = state.offset; + return state.map; +} + +static void +blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries, + unsigned state_size, unsigned state_alignment, + uint32_t *bt_offset, + uint32_t *surface_offsets, void **surface_maps) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + uint32_t state_offset; + struct anv_state bt_state; + + VkResult result = + anv_cmd_buffer_alloc_blorp_binding_table(cmd_buffer, num_entries, + &state_offset, &bt_state); + if (result != VK_SUCCESS) + return; + + uint32_t *bt_map = bt_state.map; + *bt_offset = bt_state.offset; + + for (unsigned i = 0; i < num_entries; i++) { + struct anv_state surface_state = + anv_cmd_buffer_alloc_surface_state(cmd_buffer); + bt_map[i] = surface_state.offset + state_offset; + surface_offsets[i] = surface_state.offset; + surface_maps[i] = surface_state.map; + } +} + +static uint32_t +blorp_binding_table_offset_to_pointer(struct blorp_batch *batch, + uint32_t offset) +{ + return offset; +} + +static void * +blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size, + struct blorp_address *addr) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + struct anv_state vb_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 64); + + *addr = (struct blorp_address) { + .buffer = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = vb_state.offset, + .mocs = isl_mocs(&cmd_buffer->device->isl_dev, + ISL_SURF_USAGE_VERTEX_BUFFER_BIT, false), + }; + + return vb_state.map; +} + +static void +blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch, + const struct blorp_address *addrs, + uint32_t *sizes, + unsigned num_vbs) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + for (unsigned i = 0; i < num_vbs; i++) { + struct anv_address anv_addr = { + .bo = addrs[i].buffer, + .offset = addrs[i].offset, + }; + genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, + i, anv_addr, sizes[i]); + } + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* Technically, we should call this *after* 3DPRIMITIVE but it doesn't + * really matter for blorp because we never call apply_pipe_flushes after + * this point. 
+ */ + genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, SEQUENTIAL, + (1 << num_vbs) - 1); +} + +UNUSED static struct blorp_address +blorp_get_workaround_address(struct blorp_batch *batch) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + return (struct blorp_address) { + .buffer = cmd_buffer->device->workaround_address.bo, + .offset = cmd_buffer->device->workaround_address.offset, + }; +} + +static void +blorp_flush_range(struct blorp_batch *batch, void *start, size_t size) +{ + /* We don't need to flush states anymore, since everything will be snooped. + */ +} + +static const struct intel_l3_config * +blorp_get_l3_config(struct blorp_batch *batch) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + return cmd_buffer->state.current_l3_config; +} + +static void +blorp_exec_on_render(struct blorp_batch *batch, + const struct blorp_params *params) +{ + assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0); + + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT); + + const unsigned scale = params->fast_clear_op ? UINT_MAX : 1; + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0, + params->y1 - params->y0, scale); + +#if GFX_VER >= 11 + /* The PIPE_CONTROL command description says: + * + * "Whenever a Binding Table Index (BTI) used by a Render Target Message + * points to a different RENDER_SURFACE_STATE, SW must issue a Render + * Target Cache Flush by enabling this bit. When render target flush + * is set due to new association of BTI, PS Scoreboard Stall bit must + * be set in this packet." + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "before blorp BTI change"); +#endif + + if (params->depth.enabled && + !(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL)) + genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, ¶ms->depth.surf); + + genX(flush_pipeline_select_3d)(cmd_buffer); + + /* Apply any outstanding flushes in case pipeline select haven't. */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer); + + /* BLORP doesn't do anything fancy with depth such as discards, so we want + * the PMA fix off. Also, off is always the safe option. + */ + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false); + + blorp_exec(batch, params); + +#if GFX_VER >= 11 + /* The PIPE_CONTROL command description says: + * + * "Whenever a Binding Table Index (BTI) used by a Render Target Message + * points to a different RENDER_SURFACE_STATE, SW must issue a Render + * Target Cache Flush by enabling this bit. When render target flush + * is set due to new association of BTI, PS Scoreboard Stall bit must + * be set in this packet." + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "after blorp BTI change"); +#endif + + /* Calculate state that does not get touched by blorp. + * Flush everything else. 
+ */ + anv_cmd_dirty_mask_t dirty = ~(ANV_CMD_DIRTY_INDEX_BUFFER | + ANV_CMD_DIRTY_XFB_ENABLE); + + BITSET_DECLARE(dyn_dirty, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX); + BITSET_ONES(dyn_dirty); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_VP_SCISSORS); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_FSR); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS); + if (!params->wm_prog_data) { + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES); + BITSET_CLEAR(dyn_dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP); + } + + cmd_buffer->state.gfx.vb_dirty = ~0; + cmd_buffer->state.gfx.dirty |= dirty; + BITSET_OR(cmd_buffer->vk.dynamic_graphics_state.dirty, + cmd_buffer->vk.dynamic_graphics_state.dirty, dyn_dirty); + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; +} + +static void +blorp_exec_on_compute(struct blorp_batch *batch, + const struct blorp_params *params) +{ + assert(batch->flags & BLORP_BATCH_USE_COMPUTE); + + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_COMPUTE_BIT); + + genX(flush_pipeline_select_gpgpu)(cmd_buffer); + + /* Apply any outstanding flushes in case pipeline select haven't. */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + blorp_exec(batch, params); + + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; +} + +void +genX(blorp_exec)(struct blorp_batch *batch, + const struct blorp_params *params) +{ + struct anv_cmd_buffer *cmd_buffer = batch->driver_batch; + + if (!cmd_buffer->state.current_l3_config) { + const struct intel_l3_config *cfg = + intel_get_default_l3_config(cmd_buffer->device->info); + genX(cmd_buffer_config_l3)(cmd_buffer, cfg); + } + +#if GFX_VER == 7 + /* The MI_LOAD/STORE_REGISTER_MEM commands which BLORP uses to implement + * indirect fast-clear colors can cause GPU hangs if we don't stall first. + * See genX(cmd_buffer_mi_memcpy) for more details. + */ + if (params->src.clear_color_addr.buffer || + params->dst.clear_color_addr.buffer) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "before blorp prep fast clear"); + } +#endif + + if (batch->flags & BLORP_BATCH_USE_COMPUTE) + blorp_exec_on_compute(batch, params); + else + blorp_exec_on_render(batch, params); +} diff --git a/src/intel/vulkan_hasvk/genX_cmd_buffer.c b/src/intel/vulkan_hasvk/genX_cmd_buffer.c new file mode 100644 index 00000000000..8c236c2aeba --- /dev/null +++ b/src/intel/vulkan_hasvk/genX_cmd_buffer.c @@ -0,0 +1,7488 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include + +#include "anv_private.h" +#include "anv_measure.h" +#include "vk_format.h" +#include "vk_render_pass.h" +#include "vk_util.h" +#include "util/fast_idiv_by_const.h" + +#include "common/intel_aux_map.h" +#include "common/intel_l3_config.h" +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" +#include "genxml/gen_rt_pack.h" +#include "common/intel_guardband.h" +#include "compiler/brw_prim.h" + +#include "nir/nir_xfb_info.h" + +#include "ds/intel_tracepoints.h" + +/* We reserve : + * - GPR 14 for secondary command buffer returns + * - GPR 15 for conditional rendering + */ +#define MI_BUILDER_NUM_ALLOC_GPRS 14 +#define __gen_get_batch_dwords anv_batch_emit_dwords +#define __gen_address_offset anv_address_add +#define __gen_get_batch_address(b, a) anv_batch_address(b, a) +#include "common/mi_builder.h" + +static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, + uint32_t pipeline); + +static enum anv_pipe_bits +convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) { + enum anv_pipe_bits bits = 0; + bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0; + bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0; +#if GFX_VERx10 >= 125 + bits |= (pc->PSSStallSyncEnable) ? ANV_PIPE_PSS_STALL_SYNC_BIT : 0; +#endif +#if GFX_VER >= 12 + bits |= (pc->TileCacheFlushEnable) ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0; + bits |= (pc->HDCPipelineFlushEnable) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0; +#endif + bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0; + bits |= (pc->VFCacheInvalidationEnable) ? ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->InstructionCacheInvalidateEnable) ? ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0; + bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0; + bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0; + bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0; +#if GFX_VERx10 == 125 + bits |= (pc->UntypedDataPortCacheFlushEnable) ? 
ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0; +#endif + return bits; +} + +#define anv_debug_dump_pc(pc) \ + if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \ + fputs("pc: emit PC=( ", stderr); \ + anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \ + fprintf(stderr, ") reason: %s\n", __FUNCTION__); \ + } + +static bool +is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_queue_family *queue_family = cmd_buffer->queue_family; + return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0; +} + +void +genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_device *device = cmd_buffer->device; + uint32_t mocs = isl_mocs(&device->isl_dev, 0, false); + + /* If we are emitting a new state base address we probably need to re-emit + * binding tables. + */ + cmd_buffer->state.descriptors_dirty |= ~0; + +#if GFX_VERx10 >= 125 + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + anv_debug_dump_pc(pc); + } + anv_batch_emit( + &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) { + btpa.BindingTablePoolBaseAddress = + anv_cmd_buffer_surface_base_address(cmd_buffer); + btpa.BindingTablePoolBufferSize = BINDING_TABLE_POOL_BLOCK_SIZE / 4096; + btpa.MOCS = mocs; + } +#else /* GFX_VERx10 < 125 */ + /* Emit a render target cache flush. + * + * This isn't documented anywhere in the PRM. However, it seems to be + * necessary prior to changing the surface state base address. Without + * this, we get GPU hangs when using multi-level command buffers which + * clear depth, reset state base address, and then go render stuff. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { +#if GFX_VER >= 12 + pc.HDCPipelineFlushEnable = true; +#else + pc.DCFlushEnable = true; +#endif + pc.RenderTargetCacheFlushEnable = true; + pc.CommandStreamerStallEnable = true; + anv_debug_dump_pc(pc); + } + +#if GFX_VERx10 == 120 + /* Wa_1607854226: + * + * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline + * mode by putting the pipeline temporarily in 3D mode. + */ + uint32_t gfx12_wa_pipeline = cmd_buffer->state.current_pipeline; + genX(flush_pipeline_select_3d)(cmd_buffer); +#endif + + anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) { + sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 }; + sba.GeneralStateMOCS = mocs; + sba.GeneralStateBaseAddressModifyEnable = true; + + sba.StatelessDataPortAccessMOCS = mocs; + + sba.SurfaceStateBaseAddress = + anv_cmd_buffer_surface_base_address(cmd_buffer); + sba.SurfaceStateMOCS = mocs; + sba.SurfaceStateBaseAddressModifyEnable = true; + + sba.DynamicStateBaseAddress = + (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 }; + sba.DynamicStateMOCS = mocs; + sba.DynamicStateBaseAddressModifyEnable = true; + + sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 }; + sba.IndirectObjectMOCS = mocs; + sba.IndirectObjectBaseAddressModifyEnable = true; + + sba.InstructionBaseAddress = + (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 }; + sba.InstructionMOCS = mocs; + sba.InstructionBaseAddressModifyEnable = true; + +# if (GFX_VER >= 8) + /* Broadwell requires that we specify a buffer size for a bunch of + * these fields. However, since we will be growing the BO's live, we + * just set them all to the maximum. 
+ */ + sba.GeneralStateBufferSize = 0xfffff; + sba.IndirectObjectBufferSize = 0xfffff; + if (anv_use_relocations(device->physical)) { + sba.DynamicStateBufferSize = 0xfffff; + sba.InstructionBufferSize = 0xfffff; + } else { + /* With softpin, we use fixed addresses so we actually know how big + * our base addresses are. + */ + sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096; + sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096; + } + sba.GeneralStateBufferSizeModifyEnable = true; + sba.IndirectObjectBufferSizeModifyEnable = true; + sba.DynamicStateBufferSizeModifyEnable = true; + sba.InstructionBuffersizeModifyEnable = true; +# else + /* On gfx7, we have upper bounds instead. According to the docs, + * setting an upper bound of zero means that no bounds checking is + * performed so, in theory, we should be able to leave them zero. + * However, border color is broken and the GPU bounds-checks anyway. + * To avoid this and other potential problems, we may as well set it + * for everything. + */ + sba.GeneralStateAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.GeneralStateAccessUpperBoundModifyEnable = true; + sba.DynamicStateAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.DynamicStateAccessUpperBoundModifyEnable = true; + sba.InstructionAccessUpperBound = + (struct anv_address) { .bo = NULL, .offset = 0xfffff000 }; + sba.InstructionAccessUpperBoundModifyEnable = true; +# endif +# if (GFX_VER >= 9) + sba.BindlessSurfaceStateBaseAddress = + (struct anv_address) { device->surface_state_pool.block_pool.bo, 0 }; + sba.BindlessSurfaceStateSize = (1 << 20) - 1; + sba.BindlessSurfaceStateMOCS = mocs; + sba.BindlessSurfaceStateBaseAddressModifyEnable = true; +# endif +# if (GFX_VER >= 10) + sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 }; + sba.BindlessSamplerStateMOCS = mocs; + sba.BindlessSamplerStateBaseAddressModifyEnable = true; + sba.BindlessSamplerStateBufferSize = 0; +# endif +#if GFX_VERx10 >= 125 + sba.L1CacheControl = L1CC_WB; +#endif + } + +#if GFX_VERx10 == 120 + /* Wa_1607854226: + * + * Put the pipeline back into its current mode. + */ + if (gfx12_wa_pipeline != UINT32_MAX) + genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline); +#endif + +#endif /* GFX_VERx10 < 125 */ + + /* After re-setting the surface state base address, we have to do some + * cache flushing so that the sampler engine will pick up the new + * SURFACE_STATE objects and binding tables. From the Broadwell PRM, + * Shared Function > 3D Sampler > State > State Caching (page 96): + * + * Coherency with system memory in the state cache, like the texture + * cache is handled partially by software. It is expected that the + * command stream or shader will issue Cache Flush operation or + * Cache_Flush sampler message to ensure that the L1 cache remains + * coherent with system memory. + * + * [...] + * + * Whenever the value of the Dynamic_State_Base_Addr, + * Surface_State_Base_Addr are altered, the L1 state cache must be + * invalidated to ensure the new surface or sampler state is fetched + * from system memory. + * + * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit + * which, according the PIPE_CONTROL instruction documentation in the + * Broadwell PRM: + * + * Setting this bit is independent of any other bit in this packet. + * This bit controls the invalidation of the L1 and L2 state caches + * at the top of the pipe i.e. at the parsing time. 
+ * + * Unfortunately, experimentation seems to indicate that state cache + * invalidation through a PIPE_CONTROL does nothing whatsoever in + * regards to surface state and binding tables. In stead, it seems that + * invalidating the texture cache is what is actually needed. + * + * XXX: As far as we have been able to determine through + * experimentation, shows that flush the texture cache appears to be + * sufficient. The theory here is that all of the sampling/rendering + * units cache the binding table in the texture cache. However, we have + * yet to be able to actually confirm this. + * + * Wa_14013910100: + * + * "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice + * or program pipe control with Instruction cache invalidate post + * STATE_BASE_ADDRESS command" + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.TextureCacheInvalidationEnable = true; + pc.ConstantCacheInvalidationEnable = true; + pc.StateCacheInvalidationEnable = true; +#if GFX_VERx10 == 125 + pc.InstructionCacheInvalidateEnable = true; +#endif + anv_debug_dump_pc(pc); + } +} + +static void +add_surface_reloc(struct anv_cmd_buffer *cmd_buffer, + struct anv_state state, struct anv_address addr) +{ + VkResult result; + + if (anv_use_relocations(cmd_buffer->device->physical)) { + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + result = anv_reloc_list_add(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc, + state.offset + isl_dev->ss.addr_offset, + addr.bo, addr.offset, NULL); + } else { + result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc, + addr.bo); + } + + if (unlikely(result != VK_SUCCESS)) + anv_batch_set_error(&cmd_buffer->batch, result); +} + +static void +add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer, + struct anv_surface_state state) +{ + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + + assert(!anv_address_is_null(state.address)); + add_surface_reloc(cmd_buffer, state.state, state.address); + + if (!anv_address_is_null(state.aux_address)) { + VkResult result = + anv_reloc_list_add(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc, + state.state.offset + isl_dev->ss.aux_addr_offset, + state.aux_address.bo, + state.aux_address.offset, + NULL); + if (result != VK_SUCCESS) + anv_batch_set_error(&cmd_buffer->batch, result); + } + + if (!anv_address_is_null(state.clear_address)) { + VkResult result = + anv_reloc_list_add(&cmd_buffer->surface_relocs, + &cmd_buffer->vk.pool->alloc, + state.state.offset + + isl_dev->ss.clear_color_state_offset, + state.clear_address.bo, + state.clear_address.offset, + NULL); + if (result != VK_SUCCESS) + anv_batch_set_error(&cmd_buffer->batch, result); + } +} + +static bool +isl_color_value_requires_conversion(union isl_color_value color, + const struct isl_surf *surf, + const struct isl_view *view) +{ + if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle)) + return false; + + uint32_t surf_pack[4] = { 0, 0, 0, 0 }; + isl_color_value_pack(&color, surf->format, surf_pack); + + uint32_t view_pack[4] = { 0, 0, 0, 0 }; + union isl_color_value swiz_color = + isl_color_value_swizzle_inv(color, view->swizzle); + isl_color_value_pack(&swiz_color, view->format, view_pack); + + return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0; +} + +static bool +anv_can_fast_clear_color_view(struct anv_device * device, + struct anv_image_view *iview, + VkImageLayout layout, + union isl_color_value clear_color, + uint32_t 
num_layers, + VkRect2D render_area) +{ + if (iview->planes[0].isl.base_array_layer >= + anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT, + iview->planes[0].isl.base_level)) + return false; + + /* Start by getting the fast clear type. We use the first subpass + * layout here because we don't want to fast-clear if the first subpass + * to use the attachment can't handle fast-clears. + */ + enum anv_fast_clear_type fast_clear_type = + anv_layout_to_fast_clear_type(device->info, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + layout); + switch (fast_clear_type) { + case ANV_FAST_CLEAR_NONE: + return false; + case ANV_FAST_CLEAR_DEFAULT_VALUE: + if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format)) + return false; + break; + case ANV_FAST_CLEAR_ANY: + break; + } + + /* Potentially, we could do partial fast-clears but doing so has crazy + * alignment restrictions. It's easier to just restrict to full size + * fast clears for now. + */ + if (render_area.offset.x != 0 || + render_area.offset.y != 0 || + render_area.extent.width != iview->vk.extent.width || + render_area.extent.height != iview->vk.extent.height) + return false; + + /* On Broadwell and earlier, we can only handle 0/1 clear colors */ + if (GFX_VER <= 8 && + !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format)) + return false; + + /* If the clear color is one that would require non-trivial format + * conversion on resolve, we don't bother with the fast clear. This + * shouldn't be common as most clear colors are 0/1 and the most common + * format re-interpretation is for sRGB. + */ + if (isl_color_value_requires_conversion(clear_color, + &iview->image->planes[0].primary_surface.isl, + &iview->planes[0].isl)) { + anv_perf_warn(VK_LOG_OBJS(&iview->vk.base), + "Cannot fast-clear to colors which would require " + "format conversion on resolve"); + return false; + } + + /* We only allow fast clears to the first slice of an image (level 0, + * layer 0) and only for the entire slice. This guarantees us that, at + * any given time, there is only one clear color on any given image at + * any given time. At the time of our testing (Jan 17, 2018), there + * were no known applications which would benefit from fast-clearing + * more than just the first slice. + */ + if (iview->planes[0].isl.base_level > 0 || + iview->planes[0].isl.base_array_layer > 0) { + anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base), + "Rendering with multi-lod or multi-layer framebuffer " + "with LOAD_OP_LOAD and baseMipLevel > 0 or " + "baseArrayLayer > 0. Not fast clearing."); + return false; + } + + if (num_layers > 1) { + anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base), + "Rendering to a multi-layer framebuffer with " + "LOAD_OP_CLEAR. 
Only fast-clearing the first slice"); + } + + return true; +} + +static bool +anv_can_hiz_clear_ds_view(struct anv_device *device, + const struct anv_image_view *iview, + VkImageLayout layout, + VkImageAspectFlags clear_aspects, + float depth_clear_value, + VkRect2D render_area) +{ + /* We don't do any HiZ or depth fast-clears on gfx7 yet */ + if (GFX_VER == 7) + return false; + + /* If we're just clearing stencil, we can always HiZ clear */ + if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) + return true; + + /* We must have depth in order to have HiZ */ + if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) + return false; + + const enum isl_aux_usage clear_aux_usage = + anv_layout_to_aux_usage(device->info, iview->image, + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + layout); + if (!blorp_can_hiz_clear_depth(device->info, + &iview->image->planes[0].primary_surface.isl, + clear_aux_usage, + iview->planes[0].isl.base_level, + iview->planes[0].isl.base_array_layer, + render_area.offset.x, + render_area.offset.y, + render_area.offset.x + + render_area.extent.width, + render_area.offset.y + + render_area.extent.height)) + return false; + + if (depth_clear_value != ANV_HZ_FC_VAL) + return false; + + /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared + * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports + * returning 0.0f. Gens prior to gfx8 do not support this feature at all. + */ + if (GFX_VER == 8 && anv_can_sample_with_hiz(device->info, iview->image)) + return false; + + /* If we got here, then we can fast clear */ + return true; +} + +#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) + +#if GFX_VER == 12 +static void +anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count) +{ + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + const struct anv_surface *surface = &image->planes[plane].primary_surface; + uint64_t base_address = + anv_address_physical(anv_image_address(image, &surface->memory_range)); + + const struct isl_surf *isl_surf = &image->planes[plane].primary_surface.isl; + uint64_t format_bits = intel_aux_map_format_bits_for_isl_surf(isl_surf); + + /* We're about to live-update the AUX-TT. We really don't want anyone else + * trying to read it while we're doing this. We could probably get away + * with not having this stall in some cases if we were really careful but + * it's better to play it safe. Full stall the GPU. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "before update AUX-TT"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t a = 0; a < layer_count; a++) { + const uint32_t layer = base_layer + a; + + uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0; + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = base_level + l; + + uint32_t logical_array_layer, logical_z_offset_px; + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + logical_array_layer = 0; + + /* If the given miplevel does not have this layer, then any higher + * miplevels won't either because miplevels only get smaller the + * higher the LOD. 
+ */ + assert(layer < image->vk.extent.depth); + if (layer >= anv_minify(image->vk.extent.depth, level)) + break; + logical_z_offset_px = layer; + } else { + assert(layer < image->vk.array_layers); + logical_array_layer = layer; + logical_z_offset_px = 0; + } + + uint64_t slice_start_offset_B, slice_end_offset_B; + isl_surf_get_image_range_B_tile(isl_surf, level, + logical_array_layer, + logical_z_offset_px, + &slice_start_offset_B, + &slice_end_offset_B); + + start_offset_B = MIN2(start_offset_B, slice_start_offset_B); + end_offset_B = MAX2(end_offset_B, slice_end_offset_B); + } + + /* Aux operates 64K at a time */ + start_offset_B = align_down_u64(start_offset_B, 64 * 1024); + end_offset_B = align_u64(end_offset_B, 64 * 1024); + + for (uint64_t offset = start_offset_B; + offset < end_offset_B; offset += 64 * 1024) { + uint64_t address = base_address + offset; + + uint64_t aux_entry_addr64, *aux_entry_map; + aux_entry_map = intel_aux_map_get_entry(cmd_buffer->device->aux_map_ctx, + address, &aux_entry_addr64); + + assert(!anv_use_relocations(cmd_buffer->device->physical)); + struct anv_address aux_entry_address = { + .bo = NULL, + .offset = aux_entry_addr64, + }; + + const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map); + uint64_t new_aux_entry = + (old_aux_entry & INTEL_AUX_MAP_ADDRESS_MASK) | format_bits; + + if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) + new_aux_entry |= INTEL_AUX_MAP_ENTRY_VALID_BIT; + + mi_store(&b, mi_mem64(aux_entry_address), mi_imm(new_aux_entry)); + } + } + + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT, + "after update AUX-TT"); +} +#endif /* GFX_VER == 12 */ + +/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless + * the initial layout is undefined, the HiZ buffer and depth buffer will + * represent the same data at the end of this operation. + */ +static void +transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + uint32_t base_layer, uint32_t layer_count, + VkImageLayout initial_layout, + VkImageLayout final_layout, + bool will_full_fast_clear) +{ + const uint32_t depth_plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT); + if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE) + return; + +#if GFX_VER == 12 + if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || + initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) && + cmd_buffer->device->physical->has_implicit_ccs && + cmd_buffer->device->info->has_aux_map) { + anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, 1, base_layer, layer_count); + } +#endif + + /* If will_full_fast_clear is set, the caller promises to fast-clear the + * largest portion of the specified range as it can. For depth images, + * that means the entire image because we don't support multi-LOD HiZ. 
+ */ + assert(image->planes[0].primary_surface.isl.levels == 1); + if (will_full_fast_clear) + return; + + const enum isl_aux_state initial_state = + anv_layout_to_aux_state(cmd_buffer->device->info, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + initial_layout); + const enum isl_aux_state final_state = + anv_layout_to_aux_state(cmd_buffer->device->info, image, + VK_IMAGE_ASPECT_DEPTH_BIT, + final_layout); + + const bool initial_depth_valid = + isl_aux_state_has_valid_primary(initial_state); + const bool initial_hiz_valid = + isl_aux_state_has_valid_aux(initial_state); + const bool final_needs_depth = + isl_aux_state_has_valid_primary(final_state); + const bool final_needs_hiz = + isl_aux_state_has_valid_aux(final_state); + + /* Getting into the pass-through state for Depth is tricky and involves + * both a resolve and an ambiguate. We don't handle that state right now + * as anv_layout_to_aux_state never returns it. + */ + assert(final_state != ISL_AUX_STATE_PASS_THROUGH); + + if (final_needs_depth && !initial_depth_valid) { + assert(initial_hiz_valid); + anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE); + } else if (final_needs_hiz && !initial_hiz_valid) { + assert(initial_depth_valid); + anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, + 0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE); + } +} + +#if GFX_VER == 7 +static inline bool +vk_image_layout_stencil_write_optimal(VkImageLayout layout) +{ + return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL || + layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL; +} +#endif + +/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless + * the initial layout is undefined, the HiZ buffer and depth buffer will + * represent the same data at the end of this operation. + */ +static void +transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count, + VkImageLayout initial_layout, + VkImageLayout final_layout, + bool will_full_fast_clear) +{ +#if GFX_VER == 7 + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); + + /* On gfx7, we have to store a texturable version of the stencil buffer in + * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and + * forth at strategic points. Stencil writes are only allowed in following + * layouts: + * + * - VK_IMAGE_LAYOUT_GENERAL + * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL + * + * For general, we have no nice opportunity to transition so we do the copy + * to the shadow unconditionally at the end of the subpass. For transfer + * destinations, we can update it as part of the transfer op. For the other + * layouts, we delay the copy until a transition into some other layout. 
+ */ + if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && + vk_image_layout_stencil_write_optimal(initial_layout) && + !vk_image_layout_stencil_write_optimal(final_layout)) { + anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + base_level, level_count, + base_layer, layer_count); + } +#elif GFX_VER == 12 + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); + if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE) + return; + + if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || + initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) && + cmd_buffer->device->physical->has_implicit_ccs && + cmd_buffer->device->info->has_aux_map) { + anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, + base_level, level_count, base_layer, layer_count); + + /* If will_full_fast_clear is set, the caller promises to fast-clear the + * largest portion of the specified range as it can. + */ + if (will_full_fast_clear) + return; + + for (uint32_t l = 0; l < level_count; l++) { + const uint32_t level = base_level + l; + const VkRect2D clear_rect = { + .offset.x = 0, + .offset.y = 0, + .extent.width = anv_minify(image->vk.extent.width, level), + .extent.height = anv_minify(image->vk.extent.height, level), + }; + + uint32_t aux_layers = + anv_image_aux_layers(image, VK_IMAGE_ASPECT_STENCIL_BIT, level); + uint32_t level_layer_count = + MIN2(layer_count, aux_layers - base_layer); + + /* From Bspec's 3DSTATE_STENCIL_BUFFER_BODY > Stencil Compression + * Enable: + * + * "When enabled, Stencil Buffer needs to be initialized via + * stencil clear (HZ_OP) before any renderpass." + */ + anv_image_hiz_clear(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, + level, base_layer, level_layer_count, + clear_rect, 0 /* Stencil clear value */); + } + } +#endif +} + +#define MI_PREDICATE_SRC0 0x2400 +#define MI_PREDICATE_SRC1 0x2408 +#define MI_PREDICATE_RESULT 0x2418 + +static void +set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t level, + uint32_t base_layer, uint32_t layer_count, + bool compressed) +{ + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + /* We only have compression tracking for CCS_E */ + if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E) + return; + + for (uint32_t a = 0; a < layer_count; a++) { + uint32_t layer = base_layer + a; + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = anv_image_get_compression_state_addr(cmd_buffer->device, + image, aspect, + level, layer); + sdi.ImmediateData = compressed ? UINT32_MAX : 0; + } + } +} + +static void +set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum anv_fast_clear_type fast_clear) +{ + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device, + image, aspect); + sdi.ImmediateData = fast_clear; + } + + /* Whenever we have fast-clear, we consider that slice to be compressed. + * This makes building predicates much easier. + */ + if (fast_clear != ANV_FAST_CLEAR_NONE) + set_image_compressed_bit(cmd_buffer, image, aspect, 0, 0, 1, true); +} + +/* This is only really practical on haswell and above because it requires + * MI math in order to get it correct. 
+ */ +#if GFX_VERx10 >= 75 +static void +anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t level, uint32_t array_layer, + enum isl_aux_op resolve_op, + enum anv_fast_clear_type fast_clear_supported) +{ + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + const struct mi_value fast_clear_type = + mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device, + image, aspect)); + + if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) { + /* In this case, we're doing a full resolve which means we want the + * resolve to happen if any compression (including fast-clears) is + * present. + * + * In order to simplify the logic a bit, we make the assumption that, + * if the first slice has been fast-cleared, it is also marked as + * compressed. See also set_image_fast_clear_state. + */ + const struct mi_value compression_state = + mi_mem32(anv_image_get_compression_state_addr(cmd_buffer->device, + image, aspect, + level, array_layer)); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), compression_state); + mi_store(&b, compression_state, mi_imm(0)); + + if (level == 0 && array_layer == 0) { + /* If the predicate is true, we want to write 0 to the fast clear type + * and, if it's false, leave it alone. We can do this by writing + * + * clear_type = clear_type & ~predicate; + */ + struct mi_value new_fast_clear_type = + mi_iand(&b, fast_clear_type, + mi_inot(&b, mi_reg64(MI_PREDICATE_SRC0))); + mi_store(&b, fast_clear_type, new_fast_clear_type); + } + } else if (level == 0 && array_layer == 0) { + /* In this case, we are doing a partial resolve to get rid of fast-clear + * colors. We don't care about the compression state but we do care + * about how much fast clear is allowed by the final layout. + */ + assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); + assert(fast_clear_supported < ANV_FAST_CLEAR_ANY); + + /* We need to compute (fast_clear_supported < image->fast_clear) */ + struct mi_value pred = + mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred)); + + /* If the predicate is true, we want to write 0 to the fast clear type + * and, if it's false, leave it alone. We can do this by writing + * + * clear_type = clear_type & ~predicate; + */ + struct mi_value new_fast_clear_type = + mi_iand(&b, fast_clear_type, mi_inot(&b, pred)); + mi_store(&b, fast_clear_type, new_fast_clear_type); + } else { + /* In this case, we're trying to do a partial resolve on a slice that + * doesn't have clear color. There's nothing to do. 
+ */ + assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); + return; + } + + /* Set src1 to 0 and use a != condition */ + mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } +} +#endif /* GFX_VERx10 >= 75 */ + +#if GFX_VER <= 8 +static void +anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + uint32_t level, uint32_t array_layer, + enum isl_aux_op resolve_op, + enum anv_fast_clear_type fast_clear_supported) +{ + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + struct mi_value fast_clear_type_mem = + mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device, + image, aspect)); + + /* This only works for partial resolves and only when the clear color is + * all or nothing. On the upside, this emits less command streamer code + * and works on Ivybridge and Bay Trail. + */ + assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); + assert(fast_clear_supported != ANV_FAST_CLEAR_ANY); + + /* We don't support fast clears on anything other than the first slice. */ + if (level > 0 || array_layer > 0) + return; + + /* On gfx8, we don't have a concept of default clear colors because we + * can't sample from CCS surfaces. It's enough to just load the fast clear + * state into the predicate register. + */ + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + mi_store(&b, fast_clear_type_mem, mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } +} +#endif /* GFX_VER <= 8 */ + +static void +anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + enum isl_format format, + struct isl_swizzle swizzle, + VkImageAspectFlagBits aspect, + uint32_t level, uint32_t array_layer, + enum isl_aux_op resolve_op, + enum anv_fast_clear_type fast_clear_supported) +{ + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + +#if GFX_VER >= 9 + anv_cmd_compute_resolve_predicate(cmd_buffer, image, + aspect, level, array_layer, + resolve_op, fast_clear_supported); +#else /* GFX_VER <= 8 */ + anv_cmd_simple_resolve_predicate(cmd_buffer, image, + aspect, level, array_layer, + resolve_op, fast_clear_supported); +#endif + + /* CCS_D only supports full resolves and BLORP will assert on us if we try + * to do a partial resolve on a CCS_D surface. 
+ */ + if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE && + image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D) + resolve_op = ISL_AUX_OP_FULL_RESOLVE; + + anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect, + level, array_layer, 1, resolve_op, NULL, true); +} + +static void +anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + enum isl_format format, + struct isl_swizzle swizzle, + VkImageAspectFlagBits aspect, + uint32_t array_layer, + enum isl_aux_op resolve_op, + enum anv_fast_clear_type fast_clear_supported) +{ + assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT); + assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE); + +#if GFX_VERx10 >= 75 + anv_cmd_compute_resolve_predicate(cmd_buffer, image, + aspect, 0, array_layer, + resolve_op, fast_clear_supported); + + anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect, + array_layer, 1, resolve_op, NULL, true); +#else + unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail"); +#endif +} + +void +genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + enum isl_aux_usage aux_usage, + uint32_t level, + uint32_t base_layer, + uint32_t layer_count) +{ + /* The aspect must be exactly one of the image aspects. */ + assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects)); + + /* The only compression types with more than just fast-clears are MCS, + * CCS_E, and HiZ. With HiZ we just trust the layout and don't actually + * track the current fast-clear and compression state. This leaves us + * with just MCS and CCS_E. + */ + if (aux_usage != ISL_AUX_USAGE_CCS_E && + aux_usage != ISL_AUX_USAGE_MCS) + return; + + set_image_compressed_bit(cmd_buffer, image, aspect, + level, base_layer, layer_count, true); +} + +static void +init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect) +{ + assert(cmd_buffer && image); + assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + + set_image_fast_clear_state(cmd_buffer, image, aspect, + ANV_FAST_CLEAR_NONE); + + /* Initialize the struct fields that are accessed for fast-clears so that + * the HW restrictions on the field values are satisfied. + */ + struct anv_address addr = + anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); + + if (GFX_VER >= 9) { + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + const unsigned num_dwords = GFX_VER >= 10 ? + isl_dev->ss.clear_color_state_size / 4 : + isl_dev->ss.clear_value_size / 4; + for (unsigned i = 0; i < num_dwords; i++) { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = addr; + sdi.Address.offset += i * 4; + sdi.ImmediateData = 0; + } + } + } else { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.Address = addr; + if (GFX_VERx10 >= 75) { + /* Pre-SKL, the dword containing the clear values also contains + * other fields, so we need to initialize those fields to match the + * values that would be in a color attachment. + */ + sdi.ImmediateData = ISL_CHANNEL_SELECT_RED << 25 | + ISL_CHANNEL_SELECT_GREEN << 22 | + ISL_CHANNEL_SELECT_BLUE << 19 | + ISL_CHANNEL_SELECT_ALPHA << 16; + } else if (GFX_VER == 7) { + /* On IVB, the dword containing the clear values also contains + * other fields that must be zero or can be zero. 
+ */ + sdi.ImmediateData = 0; + } + } + } +} + +/* Copy the fast-clear value dword(s) between a surface state object and an + * image's fast clear state buffer. + */ +static void +genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer, + struct anv_state surface_state, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + bool copy_from_surface_state) +{ + assert(cmd_buffer && image); + assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + + struct anv_address ss_clear_addr = { + .bo = cmd_buffer->device->surface_state_pool.block_pool.bo, + .offset = surface_state.offset + + cmd_buffer->device->isl_dev.ss.clear_value_offset, + }; + const struct anv_address entry_addr = + anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect); + unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size; + +#if GFX_VER == 7 + /* On gfx7, the combination of commands used here(MI_LOAD_REGISTER_MEM + * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is + * in-flight when they are issued even if the memory touched is not + * currently active for rendering. The weird bit is that it is not the + * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight + * rendering hangs such that the next stalling command after the + * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang. + * + * It is unclear exactly why this hang occurs. Both MI commands come with + * warnings about the 3D pipeline but that doesn't seem to fully explain + * it. My (Jason's) best theory is that it has something to do with the + * fact that we're using a GPU state register as our temporary and that + * something with reading/writing it is causing problems. + * + * In order to work around this issue, we emit a PIPE_CONTROL with the + * command streamer stall bit set. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "after copy_fast_clear_dwords. Avoid potential hang"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); +#endif + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + if (copy_from_surface_state) { + mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size); + } else { + mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size); + + /* Updating a surface state object may require that the state cache be + * invalidated. From the SKL PRM, Shared Functions -> State -> State + * Caching: + * + * Whenever the RENDER_SURFACE_STATE object in memory pointed to by + * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is + * modified [...], the L1 state cache must be invalidated to ensure + * the new surface or sampler state is fetched from system memory. + * + * In testing, SKL doesn't actually seem to need this, but HSW does. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_STATE_CACHE_INVALIDATE_BIT, + "after copy_fast_clear_dwords surface state update"); + } +} + +/** + * @brief Transitions a color buffer from one layout to another. + * + * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for + * more information. + * + * @param level_count VK_REMAINING_MIP_LEVELS isn't supported. + * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images, + * this represents the maximum layers to transition at each + * specified miplevel. 
+ */ +static void +transition_color_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image *image, + VkImageAspectFlagBits aspect, + const uint32_t base_level, uint32_t level_count, + uint32_t base_layer, uint32_t layer_count, + VkImageLayout initial_layout, + VkImageLayout final_layout, + uint64_t src_queue_family, + uint64_t dst_queue_family, + bool will_full_fast_clear) +{ + struct anv_device *device = cmd_buffer->device; + const struct intel_device_info *devinfo = device->info; + /* Validate the inputs. */ + assert(cmd_buffer); + assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV); + /* These values aren't supported for simplicity's sake. */ + assert(level_count != VK_REMAINING_MIP_LEVELS && + layer_count != VK_REMAINING_ARRAY_LAYERS); + /* Ensure the subresource range is valid. */ + UNUSED uint64_t last_level_num = base_level + level_count; + const uint32_t max_depth = anv_minify(image->vk.extent.depth, base_level); + UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth); + assert((uint64_t)base_layer + layer_count <= image_layers); + assert(last_level_num <= image->vk.mip_levels); + /* If there is a layout transfer, the final layout cannot be undefined or + * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198). + */ + assert(initial_layout == final_layout || + (final_layout != VK_IMAGE_LAYOUT_UNDEFINED && + final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED)); + const struct isl_drm_modifier_info *isl_mod_info = + image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT + ? isl_drm_modifier_get_info(image->vk.drm_format_mod) + : NULL; + + const bool src_queue_external = + src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT || + src_queue_family == VK_QUEUE_FAMILY_EXTERNAL; + + const bool dst_queue_external = + dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT || + dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL; + + /* Simultaneous acquire and release on external queues is illegal. */ + assert(!src_queue_external || !dst_queue_external); + + /* Ownership transition on an external queue requires special action if the + * image has a DRM format modifier because we store image data in + * a driver-private bo which is inaccessible to the external queue. + */ + const bool private_binding_acquire = + src_queue_external && + anv_image_is_externally_shared(image) && + anv_image_has_private_binding(image); + + const bool private_binding_release = + dst_queue_external && + anv_image_is_externally_shared(image) && + anv_image_has_private_binding(image); + + if (initial_layout == final_layout && + !private_binding_acquire && !private_binding_release) { + /* No work is needed. */ + return; + } + + const uint32_t plane = anv_image_aspect_to_plane(image, aspect); + + if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && + final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { + /* This surface is a linear compressed image with a tiled shadow surface + * for texturing. The client is about to use it in READ_ONLY_OPTIMAL so + * we need to ensure the shadow copy is up-to-date. 
+ */ + assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT); + assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT); + assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR); + assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR); + assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format)); + assert(plane == 0); + anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_COLOR_BIT, + base_level, level_count, + base_layer, layer_count); + } + + if (base_layer >= anv_image_aux_layers(image, aspect, base_level)) + return; + + assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR); + + /* The following layouts are equivalent for non-linear images. */ + const bool initial_layout_undefined = + initial_layout == VK_IMAGE_LAYOUT_UNDEFINED || + initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED; + + bool must_init_fast_clear_state = false; + bool must_init_aux_surface = false; + + if (initial_layout_undefined) { + /* The subresource may have been aliased and populated with arbitrary + * data. + */ + must_init_fast_clear_state = true; + must_init_aux_surface = true; + } else if (private_binding_acquire) { + /* The fast clear state lives in a driver-private bo, and therefore the + * external/foreign queue is unaware of it. + * + * If this is the first time we are accessing the image, then the fast + * clear state is uninitialized. + * + * If this is NOT the first time we are accessing the image, then the fast + * clear state may still be valid and correct due to the resolve during + * our most recent ownership release. However, we do not track the aux + * state with MI stores, and therefore must assume the worst-case: that + * this is the first time we are accessing the image. + */ + assert(image->planes[plane].fast_clear_memory_range.binding == + ANV_IMAGE_MEMORY_BINDING_PRIVATE); + must_init_fast_clear_state = true; + + if (image->planes[plane].aux_surface.memory_range.binding == + ANV_IMAGE_MEMORY_BINDING_PRIVATE) { + assert(isl_mod_info->aux_usage == ISL_AUX_USAGE_NONE); + + /* The aux surface, like the fast clear state, lives in + * a driver-private bo. We must initialize the aux surface for the + * same reasons we must initialize the fast clear state. + */ + must_init_aux_surface = true; + } else { + assert(isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE); + + /* The aux surface, unlike the fast clear state, lives in + * application-visible VkDeviceMemory and is shared with the + * external/foreign queue. Therefore, when we acquire ownership of the + * image with a defined VkImageLayout, the aux surface is valid and has + * the aux state required by the modifier. + */ + must_init_aux_surface = false; + } + } + +#if GFX_VER == 12 + if (initial_layout_undefined) { + if (device->physical->has_implicit_ccs && devinfo->has_aux_map) { + anv_image_init_aux_tt(cmd_buffer, image, aspect, + base_level, level_count, + base_layer, layer_count); + } + } +#else + assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map)); +#endif + + if (must_init_fast_clear_state) { + if (base_level == 0 && base_layer == 0) + init_fast_clear_color(cmd_buffer, image, aspect); + } + + if (must_init_aux_surface) { + assert(must_init_fast_clear_state); + + /* Initialize the aux buffers to enable correct rendering. In order to + * ensure that things such as storage images work correctly, aux buffers + * need to be initialized to valid data. 
+ *
+ * Having an aux buffer with invalid data is a problem for two reasons:
+ *
+ * 1) Having an invalid value in the buffer can confuse the hardware.
+ * For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
+ * invalid and leads to the hardware doing strange things. It
+ * doesn't hang as far as we can tell but rendering corruption can
+ * occur.
+ *
+ * 2) If this transition is into the GENERAL layout and we then use the
+ * image as a storage image, then we must have the aux buffer in the
+ * pass-through state so that, if we then go to texture from the
+ * image, we get the results of our storage image writes and not the
+ * fast clear color or other random data.
+ *
+ * For CCS both of the problems above are real demonstrable issues. In
+ * that case, the only thing we can do is to perform an ambiguate to
+ * transition the aux surface into the pass-through state.
+ *
+ * For MCS, (2) is never an issue because we don't support multisampled
+ * storage images. In theory, issue (1) is a problem with MCS but we've
+ * never seen it in the wild. For 4x and 16x, all bit patterns could, in
+ * theory, be interpreted as something but we don't know that all bit
+ * patterns are actually valid. For 2x and 8x, you could easily end up
+ * with the MCS referring to an invalid plane because not all bits of
+ * the MCS value are actually used. Even though we've never seen issues
+ * in the wild, it's best to play it safe and initialize the MCS. We
+ * can use a fast-clear for MCS because we only ever touch it from render
+ * and texture (no image load store).
+ */
+ if (image->vk.samples == 1) {
+ for (uint32_t l = 0; l < level_count; l++) {
+ const uint32_t level = base_level + l;
+
+ uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
+ if (base_layer >= aux_layers)
+ break; /* We will only get fewer layers as level increases */
+ uint32_t level_layer_count =
+ MIN2(layer_count, aux_layers - base_layer);
+
+ /* If will_full_fast_clear is set, the caller promises to
+ * fast-clear the largest portion of the specified range as it can.
+ * For color images, that means only the first LOD and array slice.
+ */
+ if (level == 0 && base_layer == 0 && will_full_fast_clear) {
+ base_layer++;
+ level_layer_count--;
+ if (level_layer_count == 0)
+ continue;
+ }
+
+ anv_image_ccs_op(cmd_buffer, image,
+ image->planes[plane].primary_surface.isl.format,
+ ISL_SWIZZLE_IDENTITY,
+ aspect, level, base_layer, level_layer_count,
+ ISL_AUX_OP_AMBIGUATE, NULL, false);
+
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
+ set_image_compressed_bit(cmd_buffer, image, aspect,
+ level, base_layer, level_layer_count,
+ false);
+ }
+ }
+ } else {
+ if (image->vk.samples == 4 || image->vk.samples == 16) {
+ anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
+ "Doing a potentially unnecessary fast-clear to "
+ "define an MCS buffer.");
+ }
+
+ /* If will_full_fast_clear is set, the caller promises to fast-clear
+ * the largest portion of the specified range as it can. 
+ */ + if (will_full_fast_clear) + return; + + assert(base_level == 0 && level_count == 1); + anv_image_mcs_op(cmd_buffer, image, + image->planes[plane].primary_surface.isl.format, + ISL_SWIZZLE_IDENTITY, + aspect, base_layer, layer_count, + ISL_AUX_OP_FAST_CLEAR, NULL, false); + } + return; + } + + enum isl_aux_usage initial_aux_usage = + anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout); + enum isl_aux_usage final_aux_usage = + anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout); + enum anv_fast_clear_type initial_fast_clear = + anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout); + enum anv_fast_clear_type final_fast_clear = + anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout); + + /* We must override the anv_layout_to_* functions because they are unaware of + * acquire/release direction. + */ + if (private_binding_acquire) { + initial_aux_usage = isl_mod_info->aux_usage; + initial_fast_clear = isl_mod_info->supports_clear_color ? + initial_fast_clear : ANV_FAST_CLEAR_NONE; + } else if (private_binding_release) { + final_aux_usage = isl_mod_info->aux_usage; + final_fast_clear = isl_mod_info->supports_clear_color ? + final_fast_clear : ANV_FAST_CLEAR_NONE; + } + + /* The current code assumes that there is no mixing of CCS_E and CCS_D. + * We can handle transitions between CCS_D/E to and from NONE. What we + * don't yet handle is switching between CCS_E and CCS_D within a given + * image. Doing so in a performant way requires more detailed aux state + * tracking such as what is done in i965. For now, just assume that we + * only have one type of compression. + */ + assert(initial_aux_usage == ISL_AUX_USAGE_NONE || + final_aux_usage == ISL_AUX_USAGE_NONE || + initial_aux_usage == final_aux_usage); + + /* If initial aux usage is NONE, there is nothing to resolve */ + if (initial_aux_usage == ISL_AUX_USAGE_NONE) + return; + + enum isl_aux_op resolve_op = ISL_AUX_OP_NONE; + + /* If the initial layout supports more fast clear than the final layout + * then we need at least a partial resolve. + */ + if (final_fast_clear < initial_fast_clear) + resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE; + + if (initial_aux_usage == ISL_AUX_USAGE_CCS_E && + final_aux_usage != ISL_AUX_USAGE_CCS_E) + resolve_op = ISL_AUX_OP_FULL_RESOLVE; + + if (resolve_op == ISL_AUX_OP_NONE) + return; + + /* Perform a resolve to synchronize data between the main and aux buffer. + * Before we begin, we must satisfy the cache flushing requirement specified + * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)": + * + * Any transition from any value in {Clear, Render, Resolve} to a + * different value in {Clear, Render, Resolve} requires end of pipe + * synchronization. + * + * We perform a flush of the write cache before and after the clear and + * resolve operations to meet this requirement. + * + * Unlike other drawing, fast clear operations are not properly + * synchronized. The first PIPE_CONTROL here likely ensures that the + * contents of the previous render or clear hit the render target before we + * resolve and the second likely ensures that the resolve is complete before + * we do any more rendering or clearing. 
+ */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after transition RT"); + + for (uint32_t l = 0; l < level_count; l++) { + uint32_t level = base_level + l; + + uint32_t aux_layers = anv_image_aux_layers(image, aspect, level); + if (base_layer >= aux_layers) + break; /* We will only get fewer layers as level increases */ + uint32_t level_layer_count = + MIN2(layer_count, aux_layers - base_layer); + + for (uint32_t a = 0; a < level_layer_count; a++) { + uint32_t array_layer = base_layer + a; + + /* If will_full_fast_clear is set, the caller promises to fast-clear + * the largest portion of the specified range as it can. For color + * images, that means only the first LOD and array slice. + */ + if (level == 0 && array_layer == 0 && will_full_fast_clear) + continue; + + if (image->vk.samples == 1) { + anv_cmd_predicated_ccs_resolve(cmd_buffer, image, + image->planes[plane].primary_surface.isl.format, + ISL_SWIZZLE_IDENTITY, + aspect, level, array_layer, resolve_op, + final_fast_clear); + } else { + /* We only support fast-clear on the first layer so partial + * resolves should not be used on other layers as they will use + * the clear color stored in memory that is only valid for layer0. + */ + if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE && + array_layer != 0) + continue; + + anv_cmd_predicated_mcs_resolve(cmd_buffer, image, + image->planes[plane].primary_surface.isl.format, + ISL_SWIZZLE_IDENTITY, + aspect, array_layer, resolve_op, + final_fast_clear); + } + } + } + + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "after transition RT"); +} + +static MUST_CHECK VkResult +anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer, + uint32_t color_att_count) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + /* Reserve one for the NULL state. 
*/ + unsigned num_states = 1 + color_att_count; + const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev; + const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align); + gfx->att_states = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, + num_states * ss_stride, isl_dev->ss.align); + if (gfx->att_states.map == NULL) { + return anv_batch_set_error(&cmd_buffer->batch, + VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + struct anv_state next_state = gfx->att_states; + next_state.alloc_size = isl_dev->ss.size; + + gfx->null_surface_state = next_state; + next_state.offset += ss_stride; + next_state.map += ss_stride; + + gfx->color_att_count = color_att_count; + for (uint32_t i = 0; i < color_att_count; i++) { + gfx->color_att[i] = (struct anv_attachment) { + .surface_state.state = next_state, + }; + next_state.offset += ss_stride; + next_state.map += ss_stride; + } + gfx->depth_att = (struct anv_attachment) { }; + gfx->stencil_att = (struct anv_attachment) { }; + + return VK_SUCCESS; +} + +static void +anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + gfx->render_area = (VkRect2D) { }; + gfx->layer_count = 0; + gfx->samples = 0; + + gfx->color_att_count = 0; + gfx->depth_att = (struct anv_attachment) { }; + gfx->stencil_att = (struct anv_attachment) { }; + gfx->null_surface_state = ANV_STATE_NULL; +} + +VkResult +genX(BeginCommandBuffer)( + VkCommandBuffer commandBuffer, + const VkCommandBufferBeginInfo* pBeginInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + VkResult result; + + /* If this is the first vkBeginCommandBuffer, we must *initialize* the + * command buffer's state. Otherwise, we must *reset* its state. In both + * cases we reset it. + * + * From the Vulkan 1.0 spec: + * + * If a command buffer is in the executable state and the command buffer + * was allocated from a command pool with the + * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then + * vkBeginCommandBuffer implicitly resets the command buffer, behaving + * as if vkResetCommandBuffer had been called with + * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts + * the command buffer in the recording state. + */ + anv_cmd_buffer_reset(cmd_buffer); + anv_cmd_buffer_reset_rendering(cmd_buffer); + + cmd_buffer->usage_flags = pBeginInfo->flags; + + /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for + * primary level command buffers. + * + * From the Vulkan 1.0 spec: + * + * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a + * secondary command buffer is considered to be entirely inside a render + * pass. If this is a primary command buffer, then this bit is ignored. + */ + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) + cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT; + + trace_intel_begin_cmd_buffer(&cmd_buffer->trace); + + genX(cmd_buffer_emit_state_base_address)(cmd_buffer); + + /* We sometimes store vertex data in the dynamic state buffer for blorp + * operations and our dynamic state stream may re-use data from previous + * command buffers. In order to prevent stale cache data, we flush the VF + * cache. We could do this on every blorp call but that's not really + * needed as all of the data will get written by the CPU prior to the GPU + * executing anything. 
The chances are fairly high that they will use + * blorp at least once per primary command buffer so it shouldn't be + * wasted. + * + * There is also a workaround on gfx8 which requires us to invalidate the + * VF cache occasionally. It's easier if we can assume we start with a + * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).) + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_VF_CACHE_INVALIDATE_BIT, + "new cmd buffer"); + + /* Re-emit the aux table register in every command buffer. This way we're + * ensured that we have the table even if this command buffer doesn't + * initialize any images. + */ + if (cmd_buffer->device->info->has_aux_map) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_AUX_TABLE_INVALIDATE_BIT, + "new cmd buffer with aux-tt"); + } + + /* We send an "Indirect State Pointers Disable" packet at + * EndCommandBuffer, so all push constant packets are ignored during a + * context restore. Documentation says after that command, we need to + * emit push constants again before any rendering operation. So we + * flag them dirty here to make sure they get emitted. + */ + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; + + if (cmd_buffer->usage_flags & + VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)]; + const VkRenderingInfo *resume_info = + vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level, + pBeginInfo, + gcbiar_data); + if (resume_info != NULL) { + genX(CmdBeginRendering)(commandBuffer, resume_info); + } else { + const VkCommandBufferInheritanceRenderingInfo *inheritance_info = + vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level, + pBeginInfo); + assert(inheritance_info); + + gfx->rendering_flags = inheritance_info->flags; + gfx->render_area = (VkRect2D) { }; + gfx->layer_count = 0; + gfx->samples = inheritance_info->rasterizationSamples; + gfx->view_mask = inheritance_info->viewMask; + + uint32_t color_att_count = inheritance_info->colorAttachmentCount; + result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count); + if (result != VK_SUCCESS) + return result; + + for (uint32_t i = 0; i < color_att_count; i++) { + gfx->color_att[i].vk_format = + inheritance_info->pColorAttachmentFormats[i]; + } + gfx->depth_att.vk_format = + inheritance_info->depthAttachmentFormat; + gfx->stencil_att.vk_format = + inheritance_info->stencilAttachmentFormat; + + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; + } + } + +#if GFX_VER >= 8 + /* Emit the sample pattern at the beginning of the batch because the + * default locations emitted at the device initialization might have been + * changed by a previous command buffer. + * + * Do not change that when we're continuing a previous renderpass. + */ + if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations && + !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) + genX(emit_sample_pattern)(&cmd_buffer->batch, NULL); +#endif + +#if GFX_VERx10 >= 75 + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { + const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info = + vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT); + + /* If secondary buffer supports conditional rendering + * we should emit commands as if conditional rendering is enabled. 
+ */
+ cmd_buffer->state.conditional_render_enabled =
+ conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
+ }
+#endif
+
+ return VK_SUCCESS;
+}
+
+/* From the PRM, Volume 2a:
+ *
+ * "Indirect State Pointers Disable
+ *
+ * At the completion of the post-sync operation associated with this pipe
+ * control packet, the indirect state pointers in the hardware are
+ * considered invalid; the indirect pointers are not saved in the context.
+ * If any new indirect state commands are executed in the command stream
+ * while the pipe control is pending, the new indirect state commands are
+ * preserved.
+ *
+ * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
+ * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
+ * commands are only considered as Indirect State Pointers. Once ISP is
+ * issued in a context, SW must initialize by programming push constant
+ * commands for all the shaders (at least to zero length) before attempting
+ * any rendering operation for the same context."
+ *
+ * 3DSTATE_CONSTANT_* packets are restored during a context restore,
+ * even though they point to a BO that has been already unreferenced at
+ * the end of the previous batch buffer. This has been fine so far since
+ * we are protected by the scratch page (every address not covered by
+ * a BO should be pointing to the scratch page). But on CNL, it is
+ * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
+ * instruction.
+ *
+ * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
+ * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
+ * context restore, so the mentioned hang doesn't happen. However,
+ * software must program push constant commands for all stages prior to
+ * rendering anything. So we flag them dirty in BeginCommandBuffer.
+ *
+ * Finally, we also make sure to stall at pixel scoreboard to make sure the
+ * constants have been loaded into the EUs prior to disabling the push
+ * constants, so that it doesn't hang a previous 3DPRIMITIVE.
+ */
+static void
+emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
+{
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.StallAtPixelScoreboard = true;
+ pc.CommandStreamerStallEnable = true;
+ anv_debug_dump_pc(pc);
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.IndirectStatePointersDisable = true;
+ pc.CommandStreamerStallEnable = true;
+ anv_debug_dump_pc(pc);
+ }
+}
+
+VkResult
+genX(EndCommandBuffer)(
+ VkCommandBuffer commandBuffer)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return cmd_buffer->batch.status;
+
+ anv_measure_endcommandbuffer(cmd_buffer);
+
+ /* We want every command buffer to start with the PMA fix in a known state,
+ * so we disable it at the end of the command buffer. 
+ */ + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false); + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + emit_isp_disable(cmd_buffer); + + trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level); + + anv_cmd_buffer_end_batch_buffer(cmd_buffer); + + return VK_SUCCESS; +} + +void +genX(CmdExecuteCommands)( + VkCommandBuffer commandBuffer, + uint32_t commandBufferCount, + const VkCommandBuffer* pCmdBuffers) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer); + + assert(primary->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); + + if (anv_batch_has_error(&primary->batch)) + return; + + /* The secondary command buffers will assume that the PMA fix is disabled + * when they begin executing. Make sure this is true. + */ + genX(cmd_buffer_enable_pma_fix)(primary, false); + + /* The secondary command buffer doesn't know which textures etc. have been + * flushed prior to their execution. Apply those flushes now. + */ + genX(cmd_buffer_apply_pipe_flushes)(primary); + + for (uint32_t i = 0; i < commandBufferCount; i++) { + ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]); + + assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY); + assert(!anv_batch_has_error(&secondary->batch)); + +#if GFX_VERx10 >= 75 + if (secondary->state.conditional_render_enabled) { + if (!primary->state.conditional_render_enabled) { + /* Secondary buffer is constructed as if it will be executed + * with conditional rendering, we should satisfy this dependency + * regardless of conditional rendering being enabled in primary. + */ + struct mi_builder b; + mi_builder_init(&b, primary->device->info, &primary->batch); + mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG), + mi_imm(UINT64_MAX)); + } + } +#endif + + if (secondary->usage_flags & + VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { + /* If we're continuing a render pass from the primary, we need to + * copy the surface states for the current subpass into the storage + * we allocated for them in BeginCommandBuffer. + */ + struct anv_bo *ss_bo = + primary->device->surface_state_pool.block_pool.bo; + struct anv_state src_state = primary->state.gfx.att_states; + struct anv_state dst_state = secondary->state.gfx.att_states; + assert(src_state.alloc_size == dst_state.alloc_size); + + genX(cmd_buffer_so_memcpy)(primary, + (struct anv_address) { + .bo = ss_bo, + .offset = dst_state.offset, + }, + (struct anv_address) { + .bo = ss_bo, + .offset = src_state.offset, + }, + src_state.alloc_size); + } + + anv_cmd_buffer_add_secondary(primary, secondary); + + assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL || + secondary->perf_query_pool == primary->perf_query_pool); + if (secondary->perf_query_pool) + primary->perf_query_pool = secondary->perf_query_pool; + +#if GFX_VERx10 == 120 + if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN) + primary->state.depth_reg_mode = secondary->state.depth_reg_mode; +#endif + } + + /* The secondary isn't counted in our VF cache tracking so we need to + * invalidate the whole thing. + */ + if (GFX_VER >= 8 && GFX_VER <= 9) { + anv_add_pending_pipe_bits(primary, + ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT, + "Secondary cmd buffer not tracked in VF cache"); + } + + /* The secondary may have selected a different pipeline (3D or compute) and + * may have changed the current L3$ configuration. 
Reset our tracking + * variables to invalid values to ensure that we re-emit these in the case + * where we do any draws or compute dispatches from the primary after the + * secondary has returned. + */ + primary->state.current_pipeline = UINT32_MAX; + primary->state.current_l3_config = NULL; + primary->state.current_hash_scale = 0; + primary->state.gfx.push_constant_stages = 0; + vk_dynamic_graphics_state_dirty_all(&primary->vk.dynamic_graphics_state); + + /* Each of the secondary command buffers will use its own state base + * address. We need to re-emit state base address for the primary after + * all of the secondaries are done. + * + * TODO: Maybe we want to make this a dirty bit to avoid extra state base + * address calls? + */ + genX(cmd_buffer_emit_state_base_address)(primary); +} + +/** + * Program the hardware to use the specified L3 configuration. + */ +void +genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer, + const struct intel_l3_config *cfg) +{ + assert(cfg || GFX_VER >= 12); + if (cfg == cmd_buffer->state.current_l3_config) + return; + +#if GFX_VER >= 11 + /* On Gfx11+ we use only one config, so verify it remains the same and skip + * the stalling programming entirely. + */ + assert(cfg == cmd_buffer->device->l3_config); +#else + if (INTEL_DEBUG(DEBUG_L3)) { + mesa_logd("L3 config transition: "); + intel_dump_l3_config(cfg, stderr); + } + + /* According to the hardware docs, the L3 partitioning can only be changed + * while the pipeline is completely drained and the caches are flushed, + * which involves a first PIPE_CONTROL flush which stalls the pipeline... + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DCFlushEnable = true; + pc.PostSyncOperation = NoWrite; + pc.CommandStreamerStallEnable = true; + anv_debug_dump_pc(pc); + } + + /* ...followed by a second pipelined PIPE_CONTROL that initiates + * invalidation of the relevant caches. Note that because RO invalidation + * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL + * command is processed by the CS) we cannot combine it with the previous + * stalling flush as the hardware documentation suggests, because that + * would cause the CS to stall on previous rendering *after* RO + * invalidation and wouldn't prevent the RO caches from being polluted by + * concurrent rendering before the stall completes. This intentionally + * doesn't implement the SKL+ hardware workaround suggesting to enable CS + * stall on PIPE_CONTROLs with the texture cache invalidation bit set for + * GPGPU workloads because the previous and subsequent PIPE_CONTROLs + * already guarantee that there is no concurrent GPGPU kernel execution + * (see SKL HSD 2132585). + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.TextureCacheInvalidationEnable = true; + pc.ConstantCacheInvalidationEnable = true; + pc.InstructionCacheInvalidateEnable = true; + pc.StateCacheInvalidationEnable = true; + pc.PostSyncOperation = NoWrite; + anv_debug_dump_pc(pc); + } + + /* Now send a third stalling flush to make sure that invalidation is + * complete when the L3 configuration registers are modified. 
+ */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DCFlushEnable = true; + pc.PostSyncOperation = NoWrite; + pc.CommandStreamerStallEnable = true; + anv_debug_dump_pc(pc); + } + + genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg); +#endif /* GFX_VER >= 11 */ + cmd_buffer->state.current_l3_config = cfg; +} + +enum anv_pipe_bits +genX(emit_apply_pipe_flushes)(struct anv_batch *batch, + struct anv_device *device, + uint32_t current_pipeline, + enum anv_pipe_bits bits) +{ + /* + * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization": + * + * Write synchronization is a special case of end-of-pipe + * synchronization that requires that the render cache and/or depth + * related caches are flushed to memory, where the data will become + * globally visible. This type of synchronization is required prior to + * SW (CPU) actually reading the result data from memory, or initiating + * an operation that will use as a read surface (such as a texture + * surface) a previous render target and/or depth/stencil buffer + * + * + * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": + * + * Exercising the write cache flush bits (Render Target Cache Flush + * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only + * ensures the write caches are flushed and doesn't guarantee the data + * is globally visible. + * + * SW can track the completion of the end-of-pipe-synchronization by + * using "Notify Enable" and "PostSync Operation - Write Immediate + * Data" in the PIPE_CONTROL command. + * + * In other words, flushes are pipelined while invalidations are handled + * immediately. Therefore, if we're flushing anything then we need to + * schedule an end-of-pipe sync before any invalidations can happen. + */ + if (bits & ANV_PIPE_FLUSH_BITS) + bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; + + + /* HSD 1209978178: docs say that before programming the aux table: + * + * "Driver must ensure that the engine is IDLE but ensure it doesn't + * add extra flushes in the case it knows that the engine is already + * IDLE." + */ + if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)) + bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; + + /* If we're going to do an invalidate and we have a pending end-of-pipe + * sync that has yet to be resolved, we do the end-of-pipe sync now. + */ + if ((bits & ANV_PIPE_INVALIDATE_BITS) && + (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) { + bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT; + bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT; + } + + /* Project: SKL / Argument: LRI Post Sync Operation [23] + * + * "PIPECONTROL command with “Command Streamer Stall Enable” must be + * programmed prior to programming a PIPECONTROL command with "LRI + * Post Sync Operation" in GPGPU mode of operation (i.e when + * PIPELINE_SELECT command is set to GPGPU mode of operation)." + * + * The same text exists a few rows below for Post Sync Op. + */ + if (bits & ANV_PIPE_POST_SYNC_BIT) { + if (GFX_VER == 9 && current_pipeline == GPGPU) + bits |= ANV_PIPE_CS_STALL_BIT; + bits &= ~ANV_PIPE_POST_SYNC_BIT; + } + + if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | + ANV_PIPE_END_OF_PIPE_SYNC_BIT)) { + anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) { +#if GFX_VERx10 >= 125 + /* BSpec 47112: PIPE_CONTROL::Untyped Data-Port Cache Flush: + * + * "'HDC Pipeline Flush' bit must be set for this bit to take + * effect." 
+ * + * BSpec 47112: PIPE_CONTROL::HDC Pipeline Flush: + * + * "When the "Pipeline Select" mode in PIPELINE_SELECT command is + * set to "3D", HDC Pipeline Flush can also flush/invalidate the + * LSC Untyped L1 cache based on the programming of HDC_Chicken0 + * register bits 13:11." + * + * "When the 'Pipeline Select' mode is set to 'GPGPU', the LSC + * Untyped L1 cache flush is controlled by 'Untyped Data-Port + * Cache Flush' bit in the PIPE_CONTROL command." + * + * As part of Wa_1608949956 & Wa_14010198302, i915 is programming + * HDC_CHICKEN0[11:13] = 0 ("Untyped L1 is flushed, for both 3D + * Pipecontrol Dataport flush, and UAV coherency barrier event"). + * So there is no need to set "Untyped Data-Port Cache" in 3D + * mode. + */ + pipe.UntypedDataPortCacheFlushEnable = + (bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT) && + current_pipeline == GPGPU; + pipe.HDCPipelineFlushEnable |= pipe.UntypedDataPortCacheFlushEnable; +#endif +#if GFX_VER >= 12 + pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT; + pipe.HDCPipelineFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT; +#else + /* Flushing HDC pipeline requires DC Flush on earlier HW. */ + pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT; +#endif + pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT; + pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT; + pipe.RenderTargetCacheFlushEnable = + bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; + + /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must + * be set with any PIPE_CONTROL with Depth Flush Enable bit set. + */ +#if GFX_VER >= 12 + pipe.DepthStallEnable = + pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT); +#else + pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT; +#endif + +#if GFX_VERx10 >= 125 + pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT; +#endif + + pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT; +#if GFX_VER == 8 + /* From Broadwell PRM, volume 2a: + * PIPE_CONTROL: Command Streamer Stall Enable: + * + * "This bit must be always set when PIPE_CONTROL command is + * programmed by GPGPU and MEDIA workloads, except for the cases + * when only Read Only Cache Invalidation bits are set (State + * Cache Invalidation Enable, Instruction cache Invalidation + * Enable, Texture Cache Invalidation Enable, Constant Cache + * Invalidation Enable). This is to WA FFDOP CG issue, this WA + * need not implemented when FF_DOP_CG is disabled." + * + * Since we do all the invalidation in the following PIPE_CONTROL, + * if we got here, we need a stall. + */ + pipe.CommandStreamerStallEnable |= current_pipeline == GPGPU; +#endif + + pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT; + + /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory": + * + * "The most common action to perform upon reaching a + * synchronization point is to write a value out to memory. An + * immediate value (included with the synchronization command) may + * be written." + * + * + * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization": + * + * "In case the data flushed out by the render engine is to be + * read back in to the render engine in coherent manner, then the + * render engine has to wait for the fence completion before + * accessing the flushed data. This can be achieved by following + * means on various products: PIPE_CONTROL command with CS Stall + * and the required write caches flushed with Post-Sync-Operation + * as Write Immediate Data. 
+ *
+ * Example:
+ * - Workload-1 (3D/GPGPU/MEDIA)
+ * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
+ * Immediate Data, Required Write Cache Flush bits set)
+ * - Workload-2 (Can use the data produce or output by
+ * Workload-1)
+ */
+ if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
+ pipe.CommandStreamerStallEnable = true;
+ pipe.PostSyncOperation = WriteImmediateData;
+ pipe.Address = device->workaround_address;
+ }
+
+ /*
+ * According to the Broadwell documentation, any PIPE_CONTROL with the
+ * "Command Streamer Stall" bit set must also have another bit set,
+ * with five different options:
+ *
+ * - Render Target Cache Flush
+ * - Depth Cache Flush
+ * - Stall at Pixel Scoreboard
+ * - Post-Sync Operation
+ * - Depth Stall
+ * - DC Flush Enable
+ *
+ * I chose "Stall at Pixel Scoreboard" since that's what we use in
+ * mesa and it seems to work fine. The choice is fairly arbitrary.
+ */
+ if (pipe.CommandStreamerStallEnable &&
+ !pipe.RenderTargetCacheFlushEnable &&
+ !pipe.DepthCacheFlushEnable &&
+ !pipe.StallAtPixelScoreboard &&
+ !pipe.PostSyncOperation &&
+ !pipe.DepthStallEnable &&
+ !pipe.DCFlushEnable)
+ pipe.StallAtPixelScoreboard = true;
+ anv_debug_dump_pc(pipe);
+ }
+
+ /* If a render target flush was emitted, then we can toggle off the bit
+ * saying that render target writes are ongoing.
+ */
+ if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
+ bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
+
+ if (GFX_VERx10 == 75) {
+ /* Haswell needs additional work-arounds:
+ *
+ * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
+ *
+ * Option 1:
+ * PIPE_CONTROL command with the CS Stall and the required write
+ * caches flushed with Post-SyncOperation as Write Immediate Data
+ * followed by eight dummy MI_STORE_DATA_IMM (write to scratch
+ * space) commands.
+ *
+ * Example:
+ * - Workload-1
+ * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
+ * Immediate Data, Required Write Cache Flush bits set)
+ * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
+ * - Workload-2 (Can use the data produce or output by
+ * Workload-1)
+ *
+ * Unfortunately, both the PRMs and the internal docs are a bit
+ * out-of-date in this regard. What the Windows driver does (and
+ * this appears to actually work) is to emit a register read from the
+ * memory address written by the pipe control above.
+ *
+ * What register we load into doesn't matter. We choose an indirect
+ * rendering register because we know it always exists and it's one
+ * of the first registers the command parser allows us to write. If
+ * you don't have command parser support in your kernel (pre-4.2),
+ * this will get turned into MI_NOOP and you won't get the
+ * workaround. Unfortunately, there's just not much we can do in
+ * that case. This register is perfectly safe to write since we
+ * always re-load all of the indirect draw registers right before
+ * 3DPRIMITIVE when needed anyway.
+ */
+ anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
+ lrm.MemoryAddress = device->workaround_address;
+ }
+ }
+
+ bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT);
+ }
+
+ if (bits & ANV_PIPE_INVALIDATE_BITS) {
+ /* From the SKL PRM, Vol. 
2a, "PIPE_CONTROL", + * + * "If the VF Cache Invalidation Enable is set to a 1 in a + * PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields sets to + * 0, with the VF Cache Invalidation Enable set to 0 needs to be sent + * prior to the PIPE_CONTROL with VF Cache Invalidation Enable set to + * a 1." + * + * This appears to hang Broadwell, so we restrict it to just gfx9. + */ + if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) + anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe); + + anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) { + pipe.StateCacheInvalidationEnable = + bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT; + pipe.ConstantCacheInvalidationEnable = + bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT; +#if GFX_VER >= 12 + /* Invalidates the L3 cache part in which index & vertex data is loaded + * when VERTEX_BUFFER_STATE::L3BypassDisable is set. + */ + pipe.L3ReadOnlyCacheInvalidationEnable = + bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT; +#endif + pipe.VFCacheInvalidationEnable = + bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT; + pipe.TextureCacheInvalidationEnable = + bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT; + pipe.InstructionCacheInvalidateEnable = + bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT; + + /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL", + * + * "When VF Cache Invalidate is set “Post Sync Operation” must be + * enabled to “Write Immediate Data” or “Write PS Depth Count” or + * “Write Timestamp”. + */ + if (GFX_VER == 9 && pipe.VFCacheInvalidationEnable) { + pipe.PostSyncOperation = WriteImmediateData; + pipe.Address = device->workaround_address; + } + anv_debug_dump_pc(pipe); + } + +#if GFX_VER == 12 + if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info->has_aux_map) { + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num); + lri.DataDWord = 1; + } + } +#endif + + bits &= ~ANV_PIPE_INVALIDATE_BITS; + } + + return bits; +} + +void +genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer) +{ + enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits; + + if (unlikely(cmd_buffer->device->physical->always_flush_cache)) + bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS; + else if (bits == 0) + return; + + bool trace_flush = + (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | ANV_PIPE_INVALIDATE_BITS)) != 0; + if (trace_flush) + trace_intel_begin_stall(&cmd_buffer->trace); + + if ((GFX_VER >= 8 && GFX_VER <= 9) && + (bits & ANV_PIPE_CS_STALL_BIT) && + (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) { + /* If we are doing a VF cache invalidate AND a CS stall (it must be + * both) then we can reset our vertex cache tracking. + */ + memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0, + sizeof(cmd_buffer->state.gfx.vb_dirty_ranges)); + memset(&cmd_buffer->state.gfx.ib_dirty_range, 0, + sizeof(cmd_buffer->state.gfx.ib_dirty_range)); + } + + cmd_buffer->state.pending_pipe_bits = + genX(emit_apply_pipe_flushes)(&cmd_buffer->batch, + cmd_buffer->device, + cmd_buffer->state.current_pipeline, + bits); + + if (trace_flush) { + trace_intel_end_stall(&cmd_buffer->trace, bits, + anv_pipe_flush_bit_to_ds_stall_flag, NULL); + } +} + +static void +cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer, + const VkDependencyInfo *dep_info, + const char *reason) +{ + /* XXX: Right now, we're really dumb and just flush whatever categories + * the app asks for. One of these days we may make this a bit better + * but right now that's all the hardware allows for in most areas. 
+ */ + VkAccessFlags2 src_flags = 0; + VkAccessFlags2 dst_flags = 0; + + for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) { + src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask; + dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask; + } + + for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) { + src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask; + dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask; + } + + for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) { + const VkImageMemoryBarrier2 *img_barrier = + &dep_info->pImageMemoryBarriers[i]; + + src_flags |= img_barrier->srcAccessMask; + dst_flags |= img_barrier->dstAccessMask; + + ANV_FROM_HANDLE(anv_image, image, img_barrier->image); + const VkImageSubresourceRange *range = &img_barrier->subresourceRange; + + uint32_t base_layer, layer_count; + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + base_layer = 0; + layer_count = anv_minify(image->vk.extent.depth, range->baseMipLevel); + } else { + base_layer = range->baseArrayLayer; + layer_count = vk_image_subresource_layer_count(&image->vk, range); + } + const uint32_t level_count = + vk_image_subresource_level_count(&image->vk, range); + + if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { + transition_depth_buffer(cmd_buffer, image, + base_layer, layer_count, + img_barrier->oldLayout, + img_barrier->newLayout, + false /* will_full_fast_clear */); + } + + if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { + transition_stencil_buffer(cmd_buffer, image, + range->baseMipLevel, level_count, + base_layer, layer_count, + img_barrier->oldLayout, + img_barrier->newLayout, + false /* will_full_fast_clear */); + } + + if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) { + VkImageAspectFlags color_aspects = + vk_image_expand_aspect_mask(&image->vk, range->aspectMask); + anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) { + transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit, + range->baseMipLevel, level_count, + base_layer, layer_count, + img_barrier->oldLayout, + img_barrier->newLayout, + img_barrier->srcQueueFamilyIndex, + img_barrier->dstQueueFamilyIndex, + false /* will_full_fast_clear */); + } + } + } + + enum anv_pipe_bits bits = + anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) | + anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags); + + anv_add_pending_pipe_bits(cmd_buffer, bits, reason); +} + +void genX(CmdPipelineBarrier2)( + VkCommandBuffer commandBuffer, + const VkDependencyInfo* pDependencyInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier"); +} + +static void +cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer) +{ + VkShaderStageFlags stages = + cmd_buffer->state.gfx.pipeline->active_stages; + + /* In order to avoid thrash, we assume that vertex and fragment stages + * always exist. In the rare case where one is missing *and* the other + * uses push concstants, this may be suboptimal. However, avoiding stalls + * seems more important. 
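cmd_buffer_barrier() reduces the whole VkDependencyInfo to a single pair of access masks before converting them to pipe bits. The folding step on its own looks like the sketch below, which assumes only the Vulkan 1.3 core headers and is not tied to any anv types:

#include <stdint.h>
#include <vulkan/vulkan_core.h>

/* Fold every barrier in the dependency info into one (src, dst) pair. */
static void
fold_dependency_access(const VkDependencyInfo *dep,
                       VkAccessFlags2 *src, VkAccessFlags2 *dst)
{
   *src = 0;
   *dst = 0;
   for (uint32_t i = 0; i < dep->memoryBarrierCount; i++) {
      *src |= dep->pMemoryBarriers[i].srcAccessMask;
      *dst |= dep->pMemoryBarriers[i].dstAccessMask;
   }
   for (uint32_t i = 0; i < dep->bufferMemoryBarrierCount; i++) {
      *src |= dep->pBufferMemoryBarriers[i].srcAccessMask;
      *dst |= dep->pBufferMemoryBarriers[i].dstAccessMask;
   }
   for (uint32_t i = 0; i < dep->imageMemoryBarrierCount; i++) {
      *src |= dep->pImageMemoryBarriers[i].srcAccessMask;
      *dst |= dep->pImageMemoryBarriers[i].dstAccessMask;
   }
}

int main(void)
{
   VkMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
      .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
   };
   VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };
   VkAccessFlags2 src, dst;
   fold_dependency_access(&dep, &src, &dst);
   return (src != 0 && dst != 0) ? 0 : 1;
}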
+ */ + stages |= VK_SHADER_STAGE_FRAGMENT_BIT; + if (anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline)) + stages |= VK_SHADER_STAGE_VERTEX_BIT; + + if (stages == cmd_buffer->state.gfx.push_constant_stages) + return; + + const unsigned push_constant_kb = + cmd_buffer->device->info->max_constant_urb_size_kb; + + const unsigned num_stages = + util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS); + unsigned size_per_stage = push_constant_kb / num_stages; + + /* Broadwell+ and Haswell gt3 require that the push constant sizes be in + * units of 2KB. Incidentally, these are the same platforms that have + * 32KB worth of push constant space. + */ + if (push_constant_kb == 32) + size_per_stage &= ~1u; + + uint32_t kb_used = 0; + for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) { + unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0; + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) { + alloc._3DCommandSubOpcode = 18 + i; + alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0; + alloc.ConstantBufferSize = push_size; + } + kb_used += push_size; + } + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) { + alloc.ConstantBufferOffset = kb_used; + alloc.ConstantBufferSize = push_constant_kb - kb_used; + } + +#if GFX_VERx10 == 125 + /* Wa_22011440098 + * + * In 3D mode, after programming push constant alloc command immediately + * program push constant command(ZERO length) without any commit between + * them. + */ + if (intel_device_info_is_dg2(cmd_buffer->device->info)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) { + c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0); + } + } +#endif + + cmd_buffer->state.gfx.push_constant_stages = stages; + + /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS: + * + * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to + * the next 3DPRIMITIVE command after programming the + * 3DSTATE_PUSH_CONSTANT_ALLOC_VS" + * + * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of + * pipeline setup, we need to dirty push constants. + */ + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS; +} + +static VkResult +emit_binding_table(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipe_state, + struct anv_shader_bin *shader, + struct anv_state *bt_state) +{ + uint32_t state_offset; + + struct anv_pipeline_bind_map *map = &shader->bind_map; + if (map->surface_count == 0) { + *bt_state = (struct anv_state) { 0, }; + return VK_SUCCESS; + } + + *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, + map->surface_count, + &state_offset); + uint32_t *bt_map = bt_state->map; + + if (bt_state->map == NULL) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + /* We only need to emit relocs if we're not using softpin. If we are using + * softpin then we always keep all user-allocated memory objects resident. 
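The URB push constant partitioning above can be checked with a small standalone model: divide the available space across the active graphics stages, round down to 2kB granularity on 32kB parts, and give the fragment stage whatever is left. The stage table and sizes below are made-up example values, not driver data:

#include <stdio.h>

int main(void)
{
   const unsigned push_constant_kb = 32;             /* e.g. BDW / HSW GT3 */
   const unsigned active_stages[5] = {1, 0, 0, 0, 1}; /* VS, HS, DS, GS, PS */

   unsigned num_stages = 0;
   for (unsigned i = 0; i < 5; i++)
      num_stages += active_stages[i];

   unsigned size_per_stage = push_constant_kb / num_stages;
   if (push_constant_kb == 32)
      size_per_stage &= ~1u;   /* 2kB granularity on 32kB platforms */

   unsigned kb_used = 0;
   for (unsigned i = 0; i < 4; i++) {                /* VS..GS */
      unsigned sz = active_stages[i] ? size_per_stage : 0;
      printf("stage %u: offset %ukB size %ukB\n", i, sz ? kb_used : 0, sz);
      kb_used += sz;
   }
   /* PS gets the remainder, mirroring 3DSTATE_PUSH_CONSTANT_ALLOC_PS */
   printf("PS: offset %ukB size %ukB\n", kb_used, push_constant_kb - kb_used);
   return 0;
}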
+ */ + const bool need_client_mem_relocs = + anv_use_relocations(cmd_buffer->device->physical); + struct anv_push_constants *push = &pipe_state->push_constants; + + for (uint32_t s = 0; s < map->surface_count; s++) { + struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s]; + + struct anv_state surface_state; + + switch (binding->set) { + case ANV_DESCRIPTOR_SET_NULL: + bt_map[s] = 0; + break; + + case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS: + /* Color attachment binding */ + assert(shader->stage == MESA_SHADER_FRAGMENT); + if (binding->index < cmd_buffer->state.gfx.color_att_count) { + const struct anv_attachment *att = + &cmd_buffer->state.gfx.color_att[binding->index]; + surface_state = att->surface_state.state; + } else { + surface_state = cmd_buffer->state.gfx.null_surface_state; + } + assert(surface_state.map); + bt_map[s] = surface_state.offset + state_offset; + break; + + case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: { + struct anv_state surface_state = + anv_cmd_buffer_alloc_surface_state(cmd_buffer); + + struct anv_address constant_data = { + .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo, + .offset = shader->kernel.offset + + shader->prog_data->const_data_offset, + }; + unsigned constant_data_size = shader->prog_data->const_data_size; + + const enum isl_format format = + anv_isl_format_for_descriptor_type(cmd_buffer->device, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, ISL_SWIZZLE_IDENTITY, + ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, + constant_data, constant_data_size, 1); + + assert(surface_state.map); + bt_map[s] = surface_state.offset + state_offset; + add_surface_reloc(cmd_buffer, surface_state, constant_data); + break; + } + + case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: { + /* This is always the first binding for compute shaders */ + assert(shader->stage == MESA_SHADER_COMPUTE && s == 0); + + struct anv_state surface_state = + anv_cmd_buffer_alloc_surface_state(cmd_buffer); + + const enum isl_format format = + anv_isl_format_for_descriptor_type(cmd_buffer->device, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, ISL_SWIZZLE_IDENTITY, + ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, + cmd_buffer->state.compute.num_workgroups, + 12, 1); + + assert(surface_state.map); + bt_map[s] = surface_state.offset + state_offset; + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + cmd_buffer->state.compute.num_workgroups); + } + break; + } + + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { + /* This is a descriptor set buffer so the set index is actually + * given by binding->binding. (Yes, that's confusing.) + */ + struct anv_descriptor_set *set = + pipe_state->descriptors[binding->index]; + assert(set->desc_mem.alloc_size); + assert(set->desc_surface_state.alloc_size); + bt_map[s] = set->desc_surface_state.offset + state_offset; + add_surface_reloc(cmd_buffer, set->desc_surface_state, + anv_descriptor_set_address(set)); + break; + } + + default: { + assert(binding->set < MAX_SETS); + const struct anv_descriptor_set *set = + pipe_state->descriptors[binding->set]; + if (binding->index >= set->descriptor_count) { + /* From the Vulkan spec section entitled "DescriptorSet and + * Binding Assignment": + * + * "If the array is runtime-sized, then array elements greater + * than or equal to the size of that binding in the bound + * descriptor set must not be used." 
+ * + * Unfortunately, the compiler isn't smart enough to figure out + * when a dynamic binding isn't used so it may grab the whole + * array and stick it in the binding table. In this case, it's + * safe to just skip those bindings that are OOB. + */ + assert(binding->index < set->layout->descriptor_count); + continue; + } + const struct anv_descriptor *desc = &set->descriptors[binding->index]; + + switch (desc->type) { + case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: + case VK_DESCRIPTOR_TYPE_SAMPLER: + /* Nothing for us to do here */ + continue; + + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: { + if (desc->image_view) { + struct anv_surface_state sstate = + (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? + desc->image_view->planes[binding->plane].general_sampler_surface_state : + desc->image_view->planes[binding->plane].optimal_sampler_surface_state; + surface_state = sstate.state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + } else { + surface_state = cmd_buffer->device->null_surface_state; + } + break; + } + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { + if (desc->image_view) { + struct anv_surface_state sstate = + binding->lowered_storage_surface + ? desc->image_view->planes[binding->plane].lowered_storage_surface_state + : desc->image_view->planes[binding->plane].storage_surface_state; + surface_state = sstate.state; + assert(surface_state.alloc_size); + if (surface_state.offset == 0) { + mesa_loge("Bound a image to a descriptor where the " + "descriptor does not have NonReadable " + "set and the image does not have a " + "corresponding SPIR-V format enum."); + vk_debug_report(&cmd_buffer->device->physical->instance->vk, + VK_DEBUG_REPORT_ERROR_BIT_EXT, + &desc->image_view->vk.base, + __LINE__, 0, "anv", + "Bound a image to a descriptor where the " + "descriptor does not have NonReadable " + "set and the image does not have a " + "corresponding SPIR-V format enum."); + } + if (surface_state.offset && need_client_mem_relocs) + add_surface_state_relocs(cmd_buffer, sstate); + } else { + surface_state = cmd_buffer->device->null_surface_state; + } + break; + } + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + if (desc->set_buffer_view) { + surface_state = desc->set_buffer_view->surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->set_buffer_view->address); + } + } else { + surface_state = cmd_buffer->device->null_surface_state; + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + if (desc->buffer_view) { + surface_state = desc->buffer_view->surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } + } else { + surface_state = cmd_buffer->device->null_surface_state; + } + break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + if (desc->buffer) { + /* Compute the offset within the buffer */ + uint32_t dynamic_offset = + push->dynamic_offsets[binding->dynamic_offset_index]; + uint64_t offset = desc->offset + dynamic_offset; + /* Clamp to the buffer size */ + offset = MIN2(offset, desc->buffer->vk.size); + /* Clamp the range to the buffer size */ + uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset); + + /* Align the range for 
consistency */ + if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) + range = align_u32(range, ANV_UBO_ALIGNMENT); + + struct anv_address address = + anv_address_add(desc->buffer->address, offset); + + surface_state = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); + enum isl_format format = + anv_isl_format_for_descriptor_type(cmd_buffer->device, + desc->type); + + isl_surf_usage_flags_t usage = + desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ? + ISL_SURF_USAGE_CONSTANT_BUFFER_BIT : + ISL_SURF_USAGE_STORAGE_BIT; + + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, ISL_SWIZZLE_IDENTITY, + usage, address, range, 1); + if (need_client_mem_relocs) + add_surface_reloc(cmd_buffer, surface_state, address); + } else { + surface_state = cmd_buffer->device->null_surface_state; + } + break; + } + + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + if (desc->buffer_view) { + surface_state = binding->lowered_storage_surface + ? desc->buffer_view->lowered_storage_surface_state + : desc->buffer_view->storage_surface_state; + assert(surface_state.alloc_size); + if (need_client_mem_relocs) { + add_surface_reloc(cmd_buffer, surface_state, + desc->buffer_view->address); + } + } else { + surface_state = cmd_buffer->device->null_surface_state; + } + break; + + default: + assert(!"Invalid descriptor type"); + continue; + } + assert(surface_state.map); + bt_map[s] = surface_state.offset + state_offset; + break; + } + } + } + + return VK_SUCCESS; +} + +static VkResult +emit_samplers(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipe_state, + struct anv_shader_bin *shader, + struct anv_state *state) +{ + struct anv_pipeline_bind_map *map = &shader->bind_map; + if (map->sampler_count == 0) { + *state = (struct anv_state) { 0, }; + return VK_SUCCESS; + } + + uint32_t size = map->sampler_count * 16; + *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32); + + if (state->map == NULL) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + + for (uint32_t s = 0; s < map->sampler_count; s++) { + struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s]; + const struct anv_descriptor *desc = + &pipe_state->descriptors[binding->set]->descriptors[binding->index]; + + if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER && + desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + continue; + + struct anv_sampler *sampler = desc->sampler; + + /* This can happen if we have an unfilled slot since TYPE_SAMPLER + * happens to be zero. 
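For the dynamic-offset descriptors handled a little earlier, the offset/range math reduces to clamping against the buffer size and then aligning UBO ranges. A standalone sketch with invented numbers (UBO_ALIGNMENT here stands in for ANV_UBO_ALIGNMENT):

#include <stdint.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define UBO_ALIGNMENT 64u        /* stand-in for ANV_UBO_ALIGNMENT */

static uint32_t
align_u32(uint32_t v, uint32_t a)
{
   return (v + a - 1) & ~(a - 1);
}

int main(void)
{
   /* invented example values */
   uint64_t buffer_size    = 1000;  /* VkBuffer::size */
   uint64_t desc_offset    = 256;   /* offset recorded in the descriptor */
   uint32_t dynamic_offset = 512;   /* from vkCmdBindDescriptorSets */
   uint32_t desc_range     = 512;

   /* Clamp the start to the buffer, then clamp the range to what is left. */
   uint64_t offset = MIN2(desc_offset + dynamic_offset, buffer_size);
   uint32_t range  = MIN2(desc_range, (uint32_t)(buffer_size - offset));

   /* UBO ranges are aligned up afterwards, as the driver code does. */
   range = align_u32(range, UBO_ALIGNMENT);

   printf("offset=%llu range=%u\n", (unsigned long long)offset, range);
   return 0;
}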
+ */ + if (sampler == NULL) + continue; + + memcpy(state->map + (s * 16), + sampler->state[binding->plane], sizeof(sampler->state[0])); + } + + return VK_SUCCESS; +} + +static uint32_t +flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer, + struct anv_cmd_pipeline_state *pipe_state, + const VkShaderStageFlags dirty, + struct anv_shader_bin **shaders, + uint32_t num_shaders) +{ + VkShaderStageFlags flushed = 0; + + VkResult result = VK_SUCCESS; + for (uint32_t i = 0; i < num_shaders; i++) { + if (!shaders[i]) + continue; + + gl_shader_stage stage = shaders[i]->stage; + VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage); + if ((vk_stage & dirty) == 0) + continue; + + assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers)); + result = emit_samplers(cmd_buffer, pipe_state, shaders[i], + &cmd_buffer->state.samplers[stage]); + if (result != VK_SUCCESS) + break; + + assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables)); + result = emit_binding_table(cmd_buffer, pipe_state, shaders[i], + &cmd_buffer->state.binding_tables[stage]); + if (result != VK_SUCCESS) + break; + + flushed |= vk_stage; + } + + if (result != VK_SUCCESS) { + assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY); + + result = anv_cmd_buffer_new_binding_table_block(cmd_buffer); + if (result != VK_SUCCESS) + return 0; + + /* Re-emit state base addresses so we get the new surface state base + * address before we start emitting binding tables etc. + */ + genX(cmd_buffer_emit_state_base_address)(cmd_buffer); + + /* Re-emit all active binding tables */ + flushed = 0; + + for (uint32_t i = 0; i < num_shaders; i++) { + if (!shaders[i]) + continue; + + gl_shader_stage stage = shaders[i]->stage; + + result = emit_samplers(cmd_buffer, pipe_state, shaders[i], + &cmd_buffer->state.samplers[stage]); + if (result != VK_SUCCESS) { + anv_batch_set_error(&cmd_buffer->batch, result); + return 0; + } + result = emit_binding_table(cmd_buffer, pipe_state, shaders[i], + &cmd_buffer->state.binding_tables[stage]); + if (result != VK_SUCCESS) { + anv_batch_set_error(&cmd_buffer->batch, result); + return 0; + } + + flushed |= mesa_to_vk_shader_stage(stage); + } + } + + return flushed; +} + +static void +cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer, + uint32_t stages) +{ + static const uint32_t sampler_state_opcodes[] = { + [MESA_SHADER_VERTEX] = 43, + [MESA_SHADER_TESS_CTRL] = 44, /* HS */ + [MESA_SHADER_TESS_EVAL] = 45, /* DS */ + [MESA_SHADER_GEOMETRY] = 46, + [MESA_SHADER_FRAGMENT] = 47, + }; + + static const uint32_t binding_table_opcodes[] = { + [MESA_SHADER_VERTEX] = 38, + [MESA_SHADER_TESS_CTRL] = 39, + [MESA_SHADER_TESS_EVAL] = 40, + [MESA_SHADER_GEOMETRY] = 41, + [MESA_SHADER_FRAGMENT] = 42, + }; + + anv_foreach_stage(s, stages) { + assert(s < ARRAY_SIZE(binding_table_opcodes)); + + if (cmd_buffer->state.samplers[s].alloc_size > 0) { + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) { + ssp._3DCommandSubOpcode = sampler_state_opcodes[s]; + ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset; + } + } + + /* Always emit binding table pointers if we're asked to, since on SKL + * this is what flushes push constants. 
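flush_descriptor_sets() relies on a try-once, rebase, retry-once pattern: if any table allocation fails, it grabs a new binding table block, re-emits STATE_BASE_ADDRESS, and redoes every stage; a second failure is treated as fatal. A toy model of that control flow (hypothetical helper, not driver code):

#include <stdbool.h>
#include <stdio.h>

/* pretend the first attempt runs out of binding table space */
static bool
emit_tables(int attempt)
{
   return attempt > 0;
}

int main(void)
{
   if (!emit_tables(0)) {
      /* grab a fresh binding table block, re-emit the state base address,
       * then redo every stage from scratch */
      printf("allocate new block, re-emit state base address\n");
      if (!emit_tables(1))
         printf("second failure is fatal: set batch error\n");
   }
   return 0;
}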
*/ + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) { + btp._3DCommandSubOpcode = binding_table_opcodes[s]; + btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset; + } + } +} + +static struct anv_address +get_push_range_address(struct anv_cmd_buffer *cmd_buffer, + const struct anv_shader_bin *shader, + const struct anv_push_range *range) +{ + struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + switch (range->set) { + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { + /* This is a descriptor set buffer so the set index is + * actually given by binding->binding. (Yes, that's + * confusing.) + */ + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->index]; + return anv_descriptor_set_address(set); + } + + case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: { + if (gfx_state->base.push_constants_state.alloc_size == 0) { + gfx_state->base.push_constants_state = + anv_cmd_buffer_gfx_push_constants(cmd_buffer); + } + return (struct anv_address) { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = gfx_state->base.push_constants_state.offset, + }; + } + + case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: + return (struct anv_address) { + .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo, + .offset = shader->kernel.offset + + shader->prog_data->const_data_offset, + }; + + default: { + assert(range->set < MAX_SETS); + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->set]; + const struct anv_descriptor *desc = + &set->descriptors[range->index]; + + if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { + if (desc->buffer_view) + return desc->buffer_view->address; + } else { + assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); + if (desc->buffer) { + const struct anv_push_constants *push = + &gfx_state->base.push_constants; + uint32_t dynamic_offset = + push->dynamic_offsets[range->dynamic_offset_index]; + return anv_address_add(desc->buffer->address, + desc->offset + dynamic_offset); + } + } + + /* For NULL UBOs, we just return an address in the workaround BO. We do + * writes to it for workarounds but always at the bottom. The higher + * bytes should be all zeros. + */ + assert(range->length * 32 <= 2048); + return (struct anv_address) { + .bo = cmd_buffer->device->workaround_bo, + .offset = 1024, + }; + } + } +} + + +/** Returns the size in bytes of the bound buffer + * + * The range is relative to the start of the buffer, not the start of the + * range. 
The returned range may be smaller than + * + * (range->start + range->length) * 32; + */ +static uint32_t +get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer, + const struct anv_shader_bin *shader, + const struct anv_push_range *range) +{ + assert(shader->stage != MESA_SHADER_COMPUTE); + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + switch (range->set) { + case ANV_DESCRIPTOR_SET_DESCRIPTORS: { + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->index]; + assert(range->start * 32 < set->desc_mem.alloc_size); + assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size); + return set->desc_mem.alloc_size; + } + + case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: + return (range->start + range->length) * 32; + + case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: + return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT); + + default: { + assert(range->set < MAX_SETS); + struct anv_descriptor_set *set = + gfx_state->base.descriptors[range->set]; + const struct anv_descriptor *desc = + &set->descriptors[range->index]; + + if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) { + /* Here we promote a UBO to a binding table entry so that we can avoid a layer of indirection. + * We use the descriptor set's internally allocated surface state to fill the binding table entry. + */ + if (!desc->set_buffer_view) + return 0; + + if (range->start * 32 > desc->set_buffer_view->range) + return 0; + + return desc->set_buffer_view->range; + } else { + if (!desc->buffer) + return 0; + + assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC); + /* Compute the offset within the buffer */ + const struct anv_push_constants *push = + &gfx_state->base.push_constants; + uint32_t dynamic_offset = + push->dynamic_offsets[range->dynamic_offset_index]; + uint64_t offset = desc->offset + dynamic_offset; + /* Clamp to the buffer size */ + offset = MIN2(offset, desc->buffer->vk.size); + /* Clamp the range to the buffer size */ + uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset); + + /* Align the range for consistency */ + bound_range = align_u32(bound_range, ANV_UBO_ALIGNMENT); + + return bound_range; + } + } + } +} + +static void +cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer, + gl_shader_stage stage, + struct anv_address *buffers, + unsigned buffer_count) +{ + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline; + + static const uint32_t push_constant_opcodes[] = { + [MESA_SHADER_VERTEX] = 21, + [MESA_SHADER_TESS_CTRL] = 25, /* HS */ + [MESA_SHADER_TESS_EVAL] = 26, /* DS */ + [MESA_SHADER_GEOMETRY] = 22, + [MESA_SHADER_FRAGMENT] = 23, + }; + + assert(stage < ARRAY_SIZE(push_constant_opcodes)); + + UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) { + c._3DCommandSubOpcode = push_constant_opcodes[stage]; + + /* Set MOCS, except on Gfx8, because the Broadwell PRM says: + * + * "Constant Buffer Object Control State must be always + * programmed to zero." + * + * This restriction does not exist on any newer platforms. + * + * We only have one MOCS field for the whole packet, not one per + * buffer. We could go out of our way here to walk over all of + * the buffers and see if any of them are used externally and use + * the external MOCS. However, the notion that someone would use + * the same bit of memory for both scanout and a UBO is nuts. 
+ * + * Let's not bother and assume it's all internal. + */ +#if GFX_VER >= 9 + c.MOCS = mocs; +#elif GFX_VER < 8 + c.ConstantBody.MOCS = mocs; +#endif + + if (anv_pipeline_has_stage(pipeline, stage)) { + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; + +#if GFX_VERx10 >= 75 + /* The Skylake PRM contains the following restriction: + * + * "The driver must ensure The following case does not occur + * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with + * buffer 3 read length equal to zero committed followed by a + * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to + * zero committed." + * + * To avoid this, we program the buffers in the highest slots. + * This way, slot 0 is only used if slot 3 is also used. + */ + assert(buffer_count <= 4); + const unsigned shift = 4 - buffer_count; + for (unsigned i = 0; i < buffer_count; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + + /* At this point we only have non-empty ranges */ + assert(range->length > 0); + + /* For Ivy Bridge, make sure we only set the first range (actual + * push constants) + */ + assert((GFX_VERx10 >= 75) || i == 0); + + c.ConstantBody.ReadLength[i + shift] = range->length; + c.ConstantBody.Buffer[i + shift] = + anv_address_add(buffers[i], range->start * 32); + } +#else + /* For Ivy Bridge, push constants are relative to dynamic state + * base address and we only ever push actual push constants. + */ + if (bind_map->push_ranges[0].length > 0) { + assert(buffer_count == 1); + assert(bind_map->push_ranges[0].set == + ANV_DESCRIPTOR_SET_PUSH_CONSTANTS); + assert(buffers[0].bo == + cmd_buffer->device->dynamic_state_pool.block_pool.bo); + c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length; + c.ConstantBody.Buffer[0].bo = NULL; + c.ConstantBody.Buffer[0].offset = buffers[0].offset; + } + assert(bind_map->push_ranges[1].length == 0); + assert(bind_map->push_ranges[2].length == 0); + assert(bind_map->push_ranges[3].length == 0); +#endif + } + } +} + +#if GFX_VER >= 12 +static void +cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer, + uint32_t shader_mask, + struct anv_address *buffers, + uint32_t buffer_count) +{ + if (buffer_count == 0) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) { + c.ShaderUpdateEnable = shader_mask; + c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false); + } + return; + } + + const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline; + + static const UNUSED uint32_t push_constant_opcodes[] = { + [MESA_SHADER_VERTEX] = 21, + [MESA_SHADER_TESS_CTRL] = 25, /* HS */ + [MESA_SHADER_TESS_EVAL] = 26, /* DS */ + [MESA_SHADER_GEOMETRY] = 22, + [MESA_SHADER_FRAGMENT] = 23, + }; + + gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask); + assert(stage < ARRAY_SIZE(push_constant_opcodes)); + + const struct anv_pipeline_bind_map *bind_map = + &pipeline->shaders[stage]->bind_map; + + uint32_t *dw; + const uint32_t buffer_mask = (1 << buffer_count) - 1; + const uint32_t num_dwords = 2 + 2 * buffer_count; + + dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords, + GENX(3DSTATE_CONSTANT_ALL), + .ShaderUpdateEnable = shader_mask, + .PointerBufferMask = buffer_mask, + .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false)); + + for (int i = 0; i < buffer_count; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + GENX(3DSTATE_CONSTANT_ALL_DATA_pack)( + &cmd_buffer->batch, dw + 2 
+ i * 2, + &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) { + .PointerToConstantBuffer = + anv_address_add(buffers[i], range->start * 32), + .ConstantBufferReadLength = range->length, + }); + } +} +#endif + +static void +cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer, + VkShaderStageFlags dirty_stages) +{ + VkShaderStageFlags flushed = 0; + struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline; + +#if GFX_VER >= 12 + uint32_t nobuffer_stages = 0; +#endif + + /* Compute robust pushed register access mask for each stage. */ + if (cmd_buffer->device->robust_buffer_access) { + anv_foreach_stage(stage, dirty_stages) { + if (!anv_pipeline_has_stage(pipeline, stage)) + continue; + + const struct anv_shader_bin *shader = pipeline->shaders[stage]; + const struct anv_pipeline_bind_map *bind_map = &shader->bind_map; + struct anv_push_constants *push = &gfx_state->base.push_constants; + + push->push_reg_mask[stage] = 0; + /* Start of the current range in the shader, relative to the start of + * push constants in the shader. + */ + unsigned range_start_reg = 0; + for (unsigned i = 0; i < 4; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + if (range->length == 0) + continue; + + unsigned bound_size = + get_push_range_bound_size(cmd_buffer, shader, range); + if (bound_size >= range->start * 32) { + unsigned bound_regs = + MIN2(DIV_ROUND_UP(bound_size, 32) - range->start, + range->length); + assert(range_start_reg + bound_regs <= 64); + push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg, + bound_regs); + } + + cmd_buffer->state.push_constants_dirty |= + mesa_to_vk_shader_stage(stage); + + range_start_reg += range->length; + } + } + } + + /* Resets the push constant state so that we allocate a new one if + * needed. + */ + gfx_state->base.push_constants_state = ANV_STATE_NULL; + + anv_foreach_stage(stage, dirty_stages) { + unsigned buffer_count = 0; + flushed |= mesa_to_vk_shader_stage(stage); + UNUSED uint32_t max_push_range = 0; + + struct anv_address buffers[4] = {}; + if (anv_pipeline_has_stage(pipeline, stage)) { + const struct anv_shader_bin *shader = pipeline->shaders[stage]; + const struct anv_pipeline_bind_map *bind_map = &shader->bind_map; + + /* We have to gather buffer addresses as a second step because the + * loop above puts data into the push constant area and the call to + * get_push_range_address is what locks our push constants and copies + * them into the actual GPU buffer. If we did the two loops at the + * same time, we'd risk only having some of the sizes in the push + * constant buffer when we did the copy. + */ + for (unsigned i = 0; i < 4; i++) { + const struct anv_push_range *range = &bind_map->push_ranges[i]; + if (range->length == 0) + break; + + buffers[i] = get_push_range_address(cmd_buffer, shader, range); + max_push_range = MAX2(max_push_range, range->length); + buffer_count++; + } + + /* We have at most 4 buffers but they should be tightly packed */ + for (unsigned i = buffer_count; i < 4; i++) + assert(bind_map->push_ranges[i].length == 0); + } + +#if GFX_VER >= 12 + /* If this stage doesn't have any push constants, emit it later in a + * single CONSTANT_ALL packet. + */ + if (buffer_count == 0) { + nobuffer_stages |= 1 << stage; + continue; + } + + /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL + * contains only 5 bits, so we can only use it for buffers smaller than + * 32. 
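The robust-access mask computed above marks which 32-byte push registers are actually backed by a bound buffer. The same arithmetic in isolation, with invented range and size values:

#include <stdint.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define BITFIELD64_RANGE(start, count) \
   (((count) == 64 ? ~0ull : (1ull << (count)) - 1) << (start))

int main(void)
{
   /* One push range: starts at register 2 of its surface, 8 registers long,
    * placed at register 0 of the push constant space. Only 200 bytes of the
    * surface are bound. All numbers are hypothetical.
    */
   uint32_t range_start = 2, range_length = 8, range_start_reg = 0;
   uint32_t bound_size = 200;

   uint64_t mask = 0;
   if (bound_size >= range_start * 32) {
      uint32_t bound_regs =
         MIN2(DIV_ROUND_UP(bound_size, 32) - range_start, range_length);
      mask |= BITFIELD64_RANGE(range_start_reg, bound_regs);
   }
   printf("push_reg_mask = 0x%llx\n", (unsigned long long)mask);
   return 0;
}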
+ */ + if (max_push_range < 32) { + cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage, + buffers, buffer_count); + continue; + } +#endif + + cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count); + } + +#if GFX_VER >= 12 + if (nobuffer_stages) + cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0); +#endif + + cmd_buffer->state.push_constants_dirty &= ~flushed; +} + +#if GFX_VERx10 >= 125 +static void +cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer, + VkShaderStageFlags dirty_stages) +{ + struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx; + const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline; + + if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_NV && + anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) { + + const struct anv_shader_bin *shader = pipeline->shaders[MESA_SHADER_TASK]; + const struct anv_pipeline_bind_map *bind_map = &shader->bind_map; + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) { + const struct anv_push_range *range = &bind_map->push_ranges[0]; + if (range->length > 0) { + struct anv_address buffer = + get_push_range_address(cmd_buffer, shader, range); + + uint64_t addr = anv_address_physical(buffer); + data.InlineData[0] = addr & 0xffffffff; + data.InlineData[1] = addr >> 32; + + memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW], + cmd_buffer->state.gfx.base.push_constants.client_data, + BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4); + } + } + } + + if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_NV && + anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) { + + const struct anv_shader_bin *shader = pipeline->shaders[MESA_SHADER_MESH]; + const struct anv_pipeline_bind_map *bind_map = &shader->bind_map; + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) { + const struct anv_push_range *range = &bind_map->push_ranges[0]; + if (range->length > 0) { + struct anv_address buffer = + get_push_range_address(cmd_buffer, shader, range); + + uint64_t addr = anv_address_physical(buffer); + data.InlineData[0] = addr & 0xffffffff; + data.InlineData[1] = addr >> 32; + + memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW], + cmd_buffer->state.gfx.base.push_constants.client_data, + BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4); + } + } + } + + cmd_buffer->state.push_constants_dirty &= ~dirty_stages; +} +#endif + +static void +cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer) +{ + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + + if (!(cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) && + !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) && +#if GFX_VER <= 7 + !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) && + !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) && +#endif + !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT)) + return; + + /* Take dynamic primitive topology in to account with + * 3DSTATE_CLIP::ViewportXYClipTestEnable + */ + VkPolygonMode dynamic_raster_mode = + genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline, + dyn->ia.primitive_topology); + bool xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL); + + struct GENX(3DSTATE_CLIP) clip = { + GENX(3DSTATE_CLIP_header), +#if GFX_VER <= 7 + .FrontWinding = genX(vk_to_intel_front_face)[dyn->rs.front_face], + .CullMode = genX(vk_to_intel_cullmode)[dyn->rs.cull_mode], +#endif + .ViewportXYClipTestEnable = xy_clip_test_enable, + }; + uint32_t 
dwords[GENX(3DSTATE_CLIP_length)]; + + /* TODO(mesh): Multiview. */ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + if (anv_pipeline_is_primitive(pipeline)) { + const struct brw_vue_prog_data *last = + anv_pipeline_get_last_vue_prog_data(pipeline); + if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) { + clip.MaximumVPIndex = dyn->vp.viewport_count > 0 ? + dyn->vp.viewport_count - 1 : 0; + } + } else if (anv_pipeline_is_mesh(pipeline)) { + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + if (mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) { + clip.MaximumVPIndex = dyn->vp.viewport_count > 0 ? + dyn->vp.viewport_count - 1 : 0; + } + } + + GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip); + anv_batch_emit_merge(&cmd_buffer->batch, dwords, + pipeline->gfx7.clip); +} + +static void +cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint32_t count = dyn->vp.viewport_count; + const VkViewport *viewports = dyn->vp.viewports; + struct anv_state sf_clip_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64); + + bool negative_one_to_one = + cmd_buffer->state.gfx.pipeline->negative_one_to_one; + + float scale = negative_one_to_one ? 0.5f : 1.0f; + + for (uint32_t i = 0; i < count; i++) { + const VkViewport *vp = &viewports[i]; + + /* The gfx7 state struct has just the matrix and guardband fields, the + * gfx8 struct adds the min/max viewport fields. */ + struct GENX(SF_CLIP_VIEWPORT) sfv = { + .ViewportMatrixElementm00 = vp->width / 2, + .ViewportMatrixElementm11 = vp->height / 2, + .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale, + .ViewportMatrixElementm30 = vp->x + vp->width / 2, + .ViewportMatrixElementm31 = vp->y + vp->height / 2, + .ViewportMatrixElementm32 = negative_one_to_one ? + (vp->minDepth + vp->maxDepth) * scale : vp->minDepth, + .XMinClipGuardband = -1.0f, + .XMaxClipGuardband = 1.0f, + .YMinClipGuardband = -1.0f, + .YMaxClipGuardband = 1.0f, +#if GFX_VER >= 8 + .XMinViewPort = vp->x, + .XMaxViewPort = vp->x + vp->width - 1, + .YMinViewPort = MIN2(vp->y, vp->y + vp->height), + .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1, +#endif + }; + + const uint32_t fb_size_max = 1 << 14; + uint32_t x_min = 0, x_max = fb_size_max; + uint32_t y_min = 0, y_max = fb_size_max; + + /* If we have a valid renderArea, include that */ + if (gfx->render_area.extent.width > 0 && + gfx->render_area.extent.height > 0) { + x_min = MAX2(x_min, gfx->render_area.offset.x); + x_max = MIN2(x_max, gfx->render_area.offset.x + + gfx->render_area.extent.width); + y_min = MAX2(y_min, gfx->render_area.offset.y); + y_max = MIN2(y_max, gfx->render_area.offset.y + + gfx->render_area.extent.height); + } + + /* The client is required to have enough scissors for whatever it sets + * as ViewportIndex but it's possible that they've got more viewports + * set from a previous command. Also, from the Vulkan 1.3.207: + * + * "The application must ensure (using scissor if necessary) that + * all rendering is contained within the render area." + * + * If the client doesn't set a scissor, that basically means it + * guarantees everything is in-bounds already. If we end up using a + * guardband of [-1, 1] in that case, there shouldn't be much loss.
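The SF_CLIP_VIEWPORT matrix elements are just the usual viewport scale/translate terms, with the z terms rescaled when VK_EXT_depth_clip_control selects a [-1, 1] clip space. A standalone recomputation of that math with an example viewport (illustration only):

#include <stdio.h>

int main(void)
{
   /* hypothetical viewport */
   float x = 0.0f, y = 0.0f, width = 1920.0f, height = 1080.0f;
   float min_depth = 0.0f, max_depth = 1.0f;
   int negative_one_to_one = 0;          /* depth clip control disabled */
   float scale = negative_one_to_one ? 0.5f : 1.0f;

   float m00 = width / 2;                /* x scale */
   float m11 = height / 2;               /* y scale */
   float m22 = (max_depth - min_depth) * scale;  /* z scale */
   float m30 = x + width / 2;            /* x translate */
   float m31 = y + height / 2;           /* y translate */
   float m32 = negative_one_to_one ?
      (min_depth + max_depth) * scale : min_depth; /* z translate */

   printf("m00=%.1f m11=%.1f m22=%.1f m30=%.1f m31=%.1f m32=%.1f\n",
          m00, m11, m22, m30, m31, m32);
   return 0;
}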
+ * It's theoretically possible that they could do all their clipping + * with clip planes but that'd be a bit odd. + */ + if (i < dyn->vp.scissor_count) { + const VkRect2D *scissor = &dyn->vp.scissors[i]; + x_min = MAX2(x_min, scissor->offset.x); + x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width); + y_min = MAX2(y_min, scissor->offset.y); + y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height); + } + + /* Only bother calculating the guardband if our known render area is + * less than the maximum size. Otherwise, it will calculate [-1, 1] + * anyway but possibly with precision loss. + */ + if (x_min > 0 || x_max < fb_size_max || + y_min > 0 || y_max < fb_size_max) { + intel_calculate_guardband_size(x_min, x_max, y_min, y_max, + sfv.ViewportMatrixElementm00, + sfv.ViewportMatrixElementm11, + sfv.ViewportMatrixElementm30, + sfv.ViewportMatrixElementm31, + &sfv.XMinClipGuardband, + &sfv.XMaxClipGuardband, + &sfv.YMinClipGuardband, + &sfv.YMaxClipGuardband); + } + + GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv); + } + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) { + clip.SFClipViewportPointer = sf_clip_state.offset; + } +} + +static void +cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer, + bool depth_clamp_enable) +{ + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint32_t count = dyn->vp.viewport_count; + const VkViewport *viewports = dyn->vp.viewports; + struct anv_state cc_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32); + + for (uint32_t i = 0; i < count; i++) { + const VkViewport *vp = &viewports[i]; + + /* From the Vulkan spec: + * + * "It is valid for minDepth to be greater than or equal to + * maxDepth." + */ + float min_depth = MIN2(vp->minDepth, vp->maxDepth); + float max_depth = MAX2(vp->minDepth, vp->maxDepth); + + struct GENX(CC_VIEWPORT) cc_viewport = { + .MinimumDepth = depth_clamp_enable ? min_depth : 0.0f, + .MaximumDepth = depth_clamp_enable ? max_depth : 1.0f, + }; + + GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport); + } + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) { + cc.CCViewportPointer = cc_state.offset; + } +} + +static int64_t +clamp_int64(int64_t x, int64_t min, int64_t max) +{ + if (x < min) + return min; + else if (x < max) + return x; + else + return max; +} + +static void +cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint32_t count = dyn->vp.scissor_count; + const VkRect2D *scissors = dyn->vp.scissors; + const VkViewport *viewports = dyn->vp.viewports; + + /* Wa_1409725701: + * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is + * stored as an array of up to 16 elements. The location of first + * element of the array, as specified by Pointer to SCISSOR_RECT, should + * be aligned to a 64-byte boundary. + */ + uint32_t alignment = 64; + struct anv_state scissor_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, alignment); + + for (uint32_t i = 0; i < count; i++) { + const VkRect2D *s = &scissors[i]; + const VkViewport *vp = &viewports[i]; + + /* Since xmax and ymax are inclusive, we have to have xmax < xmin or + * ymax < ymin for empty clips.
In case clip x, y, width height are all + * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't + * what we want. Just special case empty clips and produce a canonical + * empty clip. */ + static const struct GENX(SCISSOR_RECT) empty_scissor = { + .ScissorRectangleYMin = 1, + .ScissorRectangleXMin = 1, + .ScissorRectangleYMax = 0, + .ScissorRectangleXMax = 0 + }; + + const int max = 0xffff; + + uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height)); + uint32_t x_min = MAX2(s->offset.x, vp->x); + uint32_t y_max = MIN2(s->offset.y + s->extent.height - 1, + MAX2(vp->y, vp->y + vp->height) - 1); + uint32_t x_max = MIN2(s->offset.x + s->extent.width - 1, + vp->x + vp->width - 1); + + /* Do this math using int64_t so overflow gets clamped correctly. */ + if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { + y_min = clamp_int64((uint64_t) y_min, gfx->render_area.offset.y, max); + x_min = clamp_int64((uint64_t) x_min, gfx->render_area.offset.x, max); + y_max = clamp_int64((uint64_t) y_max, 0, + gfx->render_area.offset.y + + gfx->render_area.extent.height - 1); + x_max = clamp_int64((uint64_t) x_max, 0, + gfx->render_area.offset.x + + gfx->render_area.extent.width - 1); + } + + struct GENX(SCISSOR_RECT) scissor = { + .ScissorRectangleYMin = y_min, + .ScissorRectangleXMin = x_min, + .ScissorRectangleYMax = y_max, + .ScissorRectangleXMax = x_max + }; + + if (s->extent.width <= 0 || s->extent.height <= 0) { + GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, + &empty_scissor); + } else { + GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor); + } + } + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) { + ssp.ScissorRectPointer = scissor_state.offset; + } +} + +static void +cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer) +{ + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + +#if GFX_VER == 7 +# define streamout_state_dw pipeline->gfx7.streamout_state +#else +# define streamout_state_dw pipeline->gfx8.streamout_state +#endif + + uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)]; + + struct GENX(3DSTATE_STREAMOUT) so = { + GENX(3DSTATE_STREAMOUT_header), + .RenderingDisable = dyn->rs.rasterizer_discard_enable, + }; + GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so); + anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw); +} + +void +genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + uint32_t *p; + + assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0); + + genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config); + + genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1); + + genX(flush_pipeline_select_3d)(cmd_buffer); + + /* Apply any pending pipeline flushes we may have. We want to apply them + * now because, if any of those flushes are for things like push constants, + * the GPU will read the state at weird times. 
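Ignoring the render-area clamp and the int64 overflow handling, the scissor rectangle is the intersection of the API scissor and the viewport, with inclusive max coordinates and a canonical empty rectangle when the scissor has zero extent. A simplified, integer-only sketch of that intersection:

#include <stdint.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
   /* invented scissor and viewport, both in whole pixels */
   int32_t s_x = 100, s_y = 50, s_w = 300, s_h = 200;
   int32_t vp_x = 0, vp_y = 0, vp_w = 256, vp_h = 256;

   if (s_w <= 0 || s_h <= 0) {
      /* canonical empty rectangle: max < min, since the bounds are inclusive */
      printf("xmin=1 xmax=0 ymin=1 ymax=0\n");
      return 0;
   }

   int32_t x_min = MAX2(s_x, vp_x);
   int32_t y_min = MAX2(s_y, MIN2(vp_y, vp_y + vp_h));
   int32_t x_max = MIN2(s_x + s_w - 1, vp_x + vp_w - 1);
   int32_t y_max = MIN2(s_y + s_h - 1, MAX2(vp_y, vp_y + vp_h) - 1);

   printf("xmin=%d xmax=%d ymin=%d ymax=%d\n", x_min, x_max, y_min, y_max);
   return 0;
}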
+ */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used; + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) + vb_emit |= pipeline->vb_used; + + if (vb_emit) { + const uint32_t num_buffers = __builtin_popcount(vb_emit); + const uint32_t num_dwords = 1 + num_buffers * 4; + + p = anv_batch_emitn(&cmd_buffer->batch, num_dwords, + GENX(3DSTATE_VERTEX_BUFFERS)); + uint32_t i = 0; + u_foreach_bit(vb, vb_emit) { + struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer; + uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset; + + struct GENX(VERTEX_BUFFER_STATE) state; + if (buffer) { + uint32_t stride = dyn->vi_binding_strides[vb]; + UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size; + +#if GFX_VER <= 7 + bool per_instance = pipeline->vb[vb].instanced; + uint32_t divisor = pipeline->vb[vb].instance_divisor * + pipeline->instance_multiplier; +#endif + + state = (struct GENX(VERTEX_BUFFER_STATE)) { + .VertexBufferIndex = vb, + + .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo, + ISL_SURF_USAGE_VERTEX_BUFFER_BIT), +#if GFX_VER <= 7 + .BufferAccessType = per_instance ? INSTANCEDATA : VERTEXDATA, + .InstanceDataStepRate = per_instance ? divisor : 1, +#endif + .AddressModifyEnable = true, + .BufferPitch = stride, + .BufferStartingAddress = anv_address_add(buffer->address, offset), + .NullVertexBuffer = offset >= buffer->vk.size, +#if GFX_VER >= 12 + .L3BypassDisable = true, +#endif + +#if GFX_VER >= 8 + .BufferSize = size, +#else + /* XXX: to handle dynamic offset for older gens we might want + * to modify Endaddress, but there are issues when doing so: + * + * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439 + */ + .EndAddress = anv_address_add(buffer->address, buffer->vk.size - 1), +#endif + }; + } else { + state = (struct GENX(VERTEX_BUFFER_STATE)) { + .VertexBufferIndex = vb, + .NullVertexBuffer = true, + .MOCS = anv_mocs(cmd_buffer->device, NULL, + ISL_SURF_USAGE_VERTEX_BUFFER_BIT), + }; + } + +#if GFX_VER >= 8 && GFX_VER <= 9 + genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb, + state.BufferStartingAddress, + state.BufferSize); +#endif + + GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state); + i++; + } + } + + cmd_buffer->state.gfx.vb_dirty &= ~vb_emit; + + uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty & + pipeline->active_stages; + if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty && + !vk_dynamic_graphics_state_any_dirty(dyn) && + !cmd_buffer->state.push_constants_dirty) + return; + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) || + (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty & + ANV_CMD_DIRTY_PIPELINE))) { + /* Wa_16011411144: + * + * SW must insert a PIPE_CONTROL cmd before and after the + * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_* + * state is not combined with other state changes. + */ + if (intel_device_info_is_dg2(cmd_buffer->device->info)) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "before SO_BUFFER change WA"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + + /* We don't need any per-buffer dirty tracking because you're not + * allowed to bind different XFB buffers while XFB is enabled. 
+ */ + for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) { + struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx]; + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) { +#if GFX_VER < 12 + sob.SOBufferIndex = idx; +#else + sob._3DCommandOpcode = 0; + sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx; +#endif + + if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) { + sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo, 0); + sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address, + xfb->offset); +#if GFX_VER >= 8 + sob.SOBufferEnable = true; + sob.StreamOffsetWriteEnable = false; + /* Size is in DWords - 1 */ + sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1; +#else + /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so + * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the + * default for an empty SO_BUFFER packet) to disable them. + */ + sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx]; + sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address, + xfb->offset + xfb->size); +#endif + } else { + sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0); + } + } + } + + if (intel_device_info_is_dg2(cmd_buffer->device->info)) { + /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "after SO_BUFFER change WA"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } else if (GFX_VER >= 10) { + /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "after 3DSTATE_SO_BUFFER call"); + } + } + + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) { + anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch); + + /* If the pipeline changed, we may need to re-allocate push constant + * space in the URB. + */ + cmd_buffer_alloc_push_constants(cmd_buffer); + } + +#if GFX_VER <= 7 + if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT || + cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) { + /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: + * + * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth + * stall needs to be sent just prior to any 3DSTATE_VS, + * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS, + * 3DSTATE_BINDING_TABLE_POINTER_VS, + * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one + * PIPE_CONTROL needs to be sent before any combination of VS + * associated 3DSTATE." + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DepthStallEnable = true; + pc.PostSyncOperation = WriteImmediateData; + pc.Address = cmd_buffer->device->workaround_address; + anv_debug_dump_pc(pc); + } + } +#endif + + /* Render targets live in the same binding table as fragment descriptors */ + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) + descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT; + + /* We emit the binding tables and sampler tables first, then emit push + * constants and then finally emit binding table and sampler table + * pointers. It has to happen in this order, since emitting the binding + * tables may change the push constants (in case of storage images). After + * emitting push constants, on SKL+ we have to emit the corresponding + * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect. 
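The Gfx8+ and Gfx7 halves of the SO_BUFFER setup express the same binding in two ways: a surface size in DWords minus one versus an explicit end address. A small sketch of that arithmetic with made-up offset/size values:

#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
   /* invented transform feedback binding */
   uint64_t xfb_base   = 0x100000;   /* fake GPU address of the buffer */
   uint64_t xfb_offset = 256;
   uint64_t xfb_size   = 4000;       /* bytes */

   /* Gfx8+: 3DSTATE_SO_BUFFER takes a surface size in DWords, minus one */
   uint32_t surface_size = (uint32_t)DIV_ROUND_UP(xfb_size, 4) - 1;

   /* Gfx7: there is no size field, so an end address is programmed instead */
   uint64_t start = xfb_base + xfb_offset;
   uint64_t end   = start + xfb_size;

   printf("gfx8 SurfaceSize=%u, gfx7 [0x%llx, 0x%llx)\n",
          surface_size, (unsigned long long)start, (unsigned long long)end);
   return 0;
}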
+ */ + uint32_t dirty = 0; + if (descriptors_dirty) { + dirty = flush_descriptor_sets(cmd_buffer, + &cmd_buffer->state.gfx.base, + descriptors_dirty, + pipeline->shaders, + ARRAY_SIZE(pipeline->shaders)); + cmd_buffer->state.descriptors_dirty &= ~dirty; + } + + if (dirty || cmd_buffer->state.push_constants_dirty) { + /* Because we're pushing UBOs, we have to push whenever either + * descriptors or push constants is dirty. + */ + dirty |= cmd_buffer->state.push_constants_dirty; + cmd_buffer_flush_push_constants(cmd_buffer, + dirty & VK_SHADER_STAGE_ALL_GRAPHICS); +#if GFX_VERx10 >= 125 + cmd_buffer_flush_mesh_inline_data( + cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_NV | + VK_SHADER_STAGE_MESH_BIT_NV)); +#endif + } + + if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) { + cmd_buffer_emit_descriptor_pointers(cmd_buffer, + dirty & VK_SHADER_STAGE_ALL_GRAPHICS); + } + + cmd_buffer_emit_clip(cmd_buffer); + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_XFB_ENABLE)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE)) + cmd_buffer_emit_streamout(cmd_buffer); + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_RENDER_TARGETS)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) { + cmd_buffer_emit_viewport(cmd_buffer); + cmd_buffer_emit_depth_viewport(cmd_buffer, + pipeline->depth_clamp_enable); + cmd_buffer_emit_scissor(cmd_buffer); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) { + uint32_t topology; + if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) + topology = _3DPRIM_PATCHLIST(pipeline->patch_control_points); + else + topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology]; + + cmd_buffer->state.gfx.primitive_topology = topology; + +#if (GFX_VER >= 8) + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) { + vft.PrimitiveTopologyType = topology; + } +#endif + } + + genX(cmd_buffer_flush_dynamic_state)(cmd_buffer); +} + +static void +emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr, + uint32_t size, uint32_t index) +{ + uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5, + GENX(3DSTATE_VERTEX_BUFFERS)); + + GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1, + &(struct GENX(VERTEX_BUFFER_STATE)) { + .VertexBufferIndex = index, + .AddressModifyEnable = true, + .BufferPitch = 0, + .MOCS = anv_mocs(cmd_buffer->device, addr.bo, + ISL_SURF_USAGE_VERTEX_BUFFER_BIT), + .NullVertexBuffer = size == 0, +#if GFX_VER >= 12 + .L3BypassDisable = true, +#endif +#if (GFX_VER >= 8) + .BufferStartingAddress = addr, + .BufferSize = size +#else + .BufferStartingAddress = addr, + .EndAddress = anv_address_add(addr, size), +#endif + }); + + genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, + index, addr, size); +} + +static void +emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr) +{ + emit_vertex_bo(cmd_buffer, addr, addr.bo ? 
8 : 0, ANV_SVGS_VB_INDEX); +} + +static void +emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer, + uint32_t base_vertex, uint32_t base_instance) +{ + if (base_vertex == 0 && base_instance == 0) { + emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS); + } else { + struct anv_state id_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4); + + ((uint32_t *)id_state.map)[0] = base_vertex; + ((uint32_t *)id_state.map)[1] = base_instance; + + struct anv_address addr = { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = id_state.offset, + }; + + emit_base_vertex_instance_bo(cmd_buffer, addr); + } +} + +static void +emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index) +{ + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4); + + ((uint32_t *)state.map)[0] = draw_index; + + struct anv_address addr = { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = state.offset, + }; + + emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX); +} + +static void +update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type) +{ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + uint64_t vb_used = pipeline->vb_used; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + vb_used |= 1ull << ANV_SVGS_VB_INDEX; + if (vs_prog_data->uses_drawid) + vb_used |= 1ull << ANV_DRAWID_VB_INDEX; + + genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, + access_type == RANDOM, + vb_used); +} + +ALWAYS_INLINE static void +cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer, + const struct brw_vs_prog_data *vs_prog_data, + uint32_t base_vertex, + uint32_t base_instance, + uint32_t draw_id, + bool force_flush) +{ + bool emitted = false; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) { + emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance); + emitted = true; + } + if (vs_prog_data->uses_drawid) { + emit_draw_index(cmd_buffer, draw_id); + emitted = true; + } + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
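+    *
+    * (For reference: force_flush additionally makes sure any pipe flushes
+    * queued earlier in the draw path are applied before the first
+    * 3DPRIMITIVE, even when no vertex data was emitted here.)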
+ */ + if (emitted || force_flush) + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); +} + +void genX(CmdDraw)( + VkCommandBuffer commandBuffer, + uint32_t vertexCount, + uint32_t instanceCount, + uint32_t firstVertex, + uint32_t firstInstance) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + const uint32_t count = + vertexCount * instanceCount * pipeline->instance_multiplier; + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw", count); + trace_intel_begin_draw(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, + firstVertex, firstInstance, 0, + true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + prim.VertexCountPerInstance = vertexCount; + prim.StartVertexLocation = firstVertex; + prim.InstanceCount = instanceCount * + pipeline->instance_multiplier; + prim.StartInstanceLocation = firstInstance; + prim.BaseVertexLocation = 0; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); + + trace_intel_end_draw(&cmd_buffer->trace, count); +} + +void genX(CmdDrawMultiEXT)( + VkCommandBuffer commandBuffer, + uint32_t drawCount, + const VkMultiDrawInfoEXT *pVertexInfo, + uint32_t instanceCount, + uint32_t firstInstance, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + const uint32_t count = + drawCount * instanceCount * pipeline->instance_multiplier; + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw_multi", count); + trace_intel_begin_draw_multi(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + uint32_t i = 0; + vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) { + cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, + draw->firstVertex, + firstInstance, i, !i); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + prim.VertexCountPerInstance = draw->vertexCount; + prim.StartVertexLocation = draw->firstVertex; + prim.InstanceCount = instanceCount * + pipeline->instance_multiplier; + prim.StartInstanceLocation = firstInstance; + prim.BaseVertexLocation = 0; + } + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); + + trace_intel_end_draw_multi(&cmd_buffer->trace, count); +} + +void genX(CmdDrawIndexed)( + VkCommandBuffer commandBuffer, + uint32_t indexCount, + uint32_t instanceCount, + uint32_t firstIndex, + int32_t vertexOffset, + uint32_t firstInstance) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + 
struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + const uint32_t count = + indexCount * instanceCount * pipeline->instance_multiplier; + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indexed", + count); + trace_intel_begin_draw_indexed(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, vertexOffset, firstInstance, 0, true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + prim.VertexCountPerInstance = indexCount; + prim.StartVertexLocation = firstIndex; + prim.InstanceCount = instanceCount * + pipeline->instance_multiplier; + prim.StartInstanceLocation = firstInstance; + prim.BaseVertexLocation = vertexOffset; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); + + trace_intel_end_draw_indexed(&cmd_buffer->trace, count); +} + +void genX(CmdDrawMultiIndexedEXT)( + VkCommandBuffer commandBuffer, + uint32_t drawCount, + const VkMultiDrawIndexedInfoEXT *pIndexInfo, + uint32_t instanceCount, + uint32_t firstInstance, + uint32_t stride, + const int32_t *pVertexOffset) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + const uint32_t count = + drawCount * instanceCount * pipeline->instance_multiplier; + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indexed_multi", + count); + trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + uint32_t i = 0; + if (pVertexOffset) { + if (vs_prog_data->uses_drawid) { + bool emitted = true; + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) { + emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance); + emitted = true; + } + vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { + if (vs_prog_data->uses_drawid) { + emit_draw_index(cmd_buffer, i); + emitted = true; + } + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
+ */ + if (emitted) + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + prim.VertexCountPerInstance = draw->indexCount; + prim.StartVertexLocation = draw->firstIndex; + prim.InstanceCount = instanceCount * + pipeline->instance_multiplier; + prim.StartInstanceLocation = firstInstance; + prim.BaseVertexLocation = *pVertexOffset; + } + emitted = false; + } + } else { + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) { + emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance); + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + prim.VertexCountPerInstance = draw->indexCount; + prim.StartVertexLocation = draw->firstIndex; + prim.InstanceCount = instanceCount * + pipeline->instance_multiplier; + prim.StartInstanceLocation = firstInstance; + prim.BaseVertexLocation = *pVertexOffset; + } + } + } + } else { + vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { + cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, + draw->vertexOffset, + firstInstance, i, i != 0); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + prim.VertexCountPerInstance = draw->indexCount; + prim.StartVertexLocation = draw->firstIndex; + prim.InstanceCount = instanceCount * + pipeline->instance_multiplier; + prim.StartInstanceLocation = firstInstance; + prim.BaseVertexLocation = draw->vertexOffset; + } + } + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); + + trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count); +} + +/* Auto-Draw / Indirect Registers */ +#define GFX7_3DPRIM_END_OFFSET 0x2420 +#define GFX7_3DPRIM_START_VERTEX 0x2430 +#define GFX7_3DPRIM_VERTEX_COUNT 0x2434 +#define GFX7_3DPRIM_INSTANCE_COUNT 0x2438 +#define GFX7_3DPRIM_START_INSTANCE 0x243C +#define GFX7_3DPRIM_BASE_VERTEX 0x2440 + +void genX(CmdDrawIndirectByteCountEXT)( + VkCommandBuffer commandBuffer, + uint32_t instanceCount, + uint32_t firstInstance, + VkBuffer counterBuffer, + VkDeviceSize counterBufferOffset, + uint32_t counterOffset, + uint32_t vertexStride) +{ +#if GFX_VERx10 >= 75 + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + /* firstVertex is always zero for this draw function */ + const uint32_t firstVertex = 0; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indirect byte count", + instanceCount * pipeline->instance_multiplier); + trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace); + + 
genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, 0); + + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + struct mi_value count = + mi_mem32(anv_address_add(counter_buffer->address, + counterBufferOffset)); + if (counterOffset) + count = mi_isub(&b, count, mi_imm(counterOffset)); + count = mi_udiv32_imm(&b, count, vertexStride); + mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count); + + mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex)); + mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), + mi_imm(instanceCount * pipeline->instance_multiplier)); + mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance)); + mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); + + trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace, + instanceCount * pipeline->instance_multiplier); +#endif /* GFX_VERx10 >= 75 */ +} + +static void +load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr, + bool indexed) +{ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), + mi_mem32(anv_address_add(addr, 0))); + + struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4)); + if (pipeline->instance_multiplier > 1) { +#if GFX_VERx10 >= 75 + instance_count = mi_imul_imm(&b, instance_count, + pipeline->instance_multiplier); +#else + anv_finishme("Multiview + indirect draw requires MI_MATH; " + "MI_MATH is not supported on Ivy Bridge"); +#endif + } + mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count); + + mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), + mi_mem32(anv_address_add(addr, 8))); + + if (indexed) { + mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), + mi_mem32(anv_address_add(addr, 12))); + mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), + mi_mem32(anv_address_add(addr, 16))); + } else { + mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), + mi_mem32(anv_address_add(addr, 12))); + mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0)); + } +} + +void genX(CmdDrawIndirect)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indirect", 
+ drawCount); + trace_intel_begin_draw_indirect(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + for (uint32_t i = 0; i < drawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8)); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); + + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + load_indirect_parameters(cmd_buffer, draw, false); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); + + offset += stride; + } + + trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount); +} + +void genX(CmdDrawIndexedIndirect)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indexed indirect", + drawCount); + trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + for (uint32_t i = 0; i < drawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + + /* TODO: We need to stomp base vertex to 0 somehow */ + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12)); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); + + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
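+       *
+       * (For reference: anv_address_add(draw, 12) above points at the
+       * vertexOffset/firstInstance pair of VkDrawIndexedIndirectCommand,
+       * so the 8 bytes bound through ANV_SVGS_VB_INDEX are the base
+       * vertex/base instance values the vertex shader expects.)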
+ */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + load_indirect_parameters(cmd_buffer, draw, true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); + + offset += stride; + } + + trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount); +} + +static struct mi_value +prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer, + struct mi_builder *b, + struct anv_buffer *count_buffer, + uint64_t countBufferOffset) +{ + struct anv_address count_address = + anv_address_add(count_buffer->address, countBufferOffset); + + struct mi_value ret = mi_imm(0); + + if (cmd_buffer->state.conditional_render_enabled) { +#if GFX_VERx10 >= 75 + ret = mi_new_gpr(b); + mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address)); +#endif + } else { + /* Upload the current draw count from the draw parameters buffer to + * MI_PREDICATE_SRC0. + */ + mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address)); + mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0)); + } + + return ret; +} + +static void +emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer, + struct mi_builder *b, + uint32_t draw_index) +{ + /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */ + mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index)); + + if (draw_index == 0) { + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + } else { + /* While draw_index < draw_count the predicate's result will be + * (draw_index == draw_count) ^ TRUE = TRUE + * When draw_index == draw_count the result is + * (TRUE) ^ TRUE = FALSE + * After this all results will be: + * (FALSE) ^ FALSE = FALSE + */ + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_XOR; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + } +} + +#if GFX_VERx10 >= 75 +static void +emit_draw_count_predicate_with_conditional_render( + struct anv_cmd_buffer *cmd_buffer, + struct mi_builder *b, + uint32_t draw_index, + struct mi_value max) +{ + struct mi_value pred = mi_ult(b, mi_imm(draw_index), max); + pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG)); + +#if GFX_VER >= 8 + mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred); +#else + /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser + * so we emit MI_PREDICATE to set it. 
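+    *
+    * (Sketch of the intent: with SRC0 = pred and SRC1 = 0, LOAD_LOADINV +
+    * COMPARE_SRCS_EQUAL loads !(pred == 0) into the predicate, so the
+    * predicated draws that follow only execute while pred is non-zero.)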
+ */ + + mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred); + mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } +#endif +} +#endif + +static void +emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer, + struct mi_builder *b, + uint32_t draw_index, + struct mi_value max) +{ +#if GFX_VERx10 >= 75 + if (cmd_buffer->state.conditional_render_enabled) { + emit_draw_count_predicate_with_conditional_render( + cmd_buffer, b, draw_index, mi_value_ref(b, max)); + } else { + emit_draw_count_predicate(cmd_buffer, b, draw_index); + } +#else + emit_draw_count_predicate(cmd_buffer, b, draw_index); +#endif +} + +void genX(CmdDrawIndirectCount)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + VkBuffer _countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indirect count", + 0); + trace_intel_begin_draw_indirect_count(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + struct mi_value max = + prepare_for_draw_count_predicate(cmd_buffer, &b, + count_buffer, countBufferOffset); + + for (uint32_t i = 0; i < maxDrawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + + emit_draw_count_predicate_cond(cmd_buffer, &b, i, max); + + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8)); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); + + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. 
+ */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + load_indirect_parameters(cmd_buffer, draw, false); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = true; + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL); + + offset += stride; + } + + mi_value_unref(&b, max); + + trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount); +} + +void genX(CmdDrawIndexedIndirectCount)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + VkBuffer _countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_DRAW, + "draw indexed indirect count", + 0); + trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace); + + genX(cmd_buffer_flush_state)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + struct mi_value max = + prepare_for_draw_count_predicate(cmd_buffer, &b, + count_buffer, countBufferOffset); + + for (uint32_t i = 0; i < maxDrawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + + emit_draw_count_predicate_cond(cmd_buffer, &b, i, max); + + /* TODO: We need to stomp base vertex to 0 somehow */ + if (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) + emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12)); + if (vs_prog_data->uses_drawid) + emit_draw_index(cmd_buffer, i); + + /* Emitting draw index or vertex index BOs may result in needing + * additional VF cache flushes. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + load_indirect_parameters(cmd_buffer, draw, true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { + prim.IndirectParameterEnable = true; + prim.PredicateEnable = true; + prim.VertexAccessType = RANDOM; + prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology; + } + + update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM); + + offset += stride; + } + + mi_value_unref(&b, max); + + trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount); + +} + +void genX(CmdBeginTransformFeedbackEXT)( + VkCommandBuffer commandBuffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer* pCounterBuffers, + const VkDeviceSize* pCounterBufferOffsets) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + assert(firstCounterBuffer < MAX_XFB_BUFFERS); + assert(counterBufferCount <= MAX_XFB_BUFFERS); + assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS); + + /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET: + * + * "Ssoftware must ensure that no HW stream output operations can be in + * process or otherwise pending at the point that the MI_LOAD/STORE + * commands are processed. This will likely require a pipeline flush." 
+    */
+   anv_add_pending_pipe_bits(cmd_buffer,
+                             ANV_PIPE_CS_STALL_BIT,
+                             "begin transform feedback");
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
+      /* If we have a counter buffer, this is a resume so we need to load the
+       * value into the streamout offset register. Otherwise, this is a begin
+       * and we need to reset it to zero.
+       */
+      if (pCounterBuffers &&
+          idx >= firstCounterBuffer &&
+          idx - firstCounterBuffer < counterBufferCount &&
+          pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
+         uint32_t cb_idx = idx - firstCounterBuffer;
+         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
+         uint64_t offset = pCounterBufferOffsets ?
+                           pCounterBufferOffsets[cb_idx] : 0;
+
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+            lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+            lrm.MemoryAddress = anv_address_add(counter_buffer->address,
+                                                offset);
+         }
+      } else {
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+            lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+            lri.DataDWord = 0;
+         }
+      }
+   }
+
+   cmd_buffer->state.xfb_enabled = true;
+   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+}
+
+void genX(CmdEndTransformFeedbackEXT)(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstCounterBuffer,
+    uint32_t                                    counterBufferCount,
+    const VkBuffer*                             pCounterBuffers,
+    const VkDeviceSize*                         pCounterBufferOffsets)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   assert(firstCounterBuffer < MAX_XFB_BUFFERS);
+   assert(counterBufferCount <= MAX_XFB_BUFFERS);
+   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
+
+   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
+    *
+    *    "Software must ensure that no HW stream output operations can be in
+    *    process or otherwise pending at the point that the MI_LOAD/STORE
+    *    commands are processed. This will likely require a pipeline flush."
+    */
+   anv_add_pending_pipe_bits(cmd_buffer,
+                             ANV_PIPE_CS_STALL_BIT,
+                             "end transform feedback");
+   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+   for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
+      unsigned idx = firstCounterBuffer + cb_idx;
+
+      /* If we have a counter buffer, store the current streamout write
+       * offset into it so a later resume can reload it. Without a counter
+       * buffer there is nothing to save.
+       */
+      if (pCounterBuffers &&
+          cb_idx < counterBufferCount &&
+          pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
+         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
+         uint64_t offset = pCounterBufferOffsets ?
+                           pCounterBufferOffsets[cb_idx] : 0;
+
+         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
+            srm.MemoryAddress = anv_address_add(counter_buffer->address,
+                                                offset);
+            srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+         }
+      }
+   }
+
+   cmd_buffer->state.xfb_enabled = false;
+   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+}
+
+#if GFX_VERx10 >= 125
+void
+genX(CmdDrawMeshTasksNV)(
+    VkCommandBuffer commandBuffer,
+    uint32_t taskCount,
+    uint32_t firstTask)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   /* TODO(mesh): Check if this is not emitting more packets than we need.
*/ + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + /* BSpec 54016 says: "The values passed for Starting ThreadGroup ID X + * and ThreadGroup Count X shall not cause TGIDs to exceed (2^32)-1." + */ + assert((int64_t)firstTask + taskCount - 1 <= UINT32_MAX); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_1D), m) { + m.PredicateEnable = cmd_buffer->state.conditional_render_enabled; + m.ThreadGroupCountX = taskCount; + m.StartingThreadGroupIDX = firstTask; + } +} + +#define GFX125_3DMESH_TG_COUNT 0x26F0 +#define GFX125_3DMESH_STARTING_TGID 0x26F4 +#define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */ + +static void +mesh_load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer, + struct mi_builder *b, + struct anv_address addr, + bool emit_xp0, + uint32_t xp0) +{ + const size_t taskCountOff = offsetof(VkDrawMeshTasksIndirectCommandNV, taskCount); + const size_t firstTaskOff = offsetof(VkDrawMeshTasksIndirectCommandNV, firstTask); + + mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT), + mi_mem32(anv_address_add(addr, taskCountOff))); + + mi_store(b, mi_reg32(GFX125_3DMESH_STARTING_TGID), + mi_mem32(anv_address_add(addr, firstTaskOff))); + + if (emit_xp0) + mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0)); +} + +static void +emit_indirect_3dmesh_1d(struct anv_batch *batch, + bool predicate_enable, + bool uses_drawid) +{ + uint32_t len = GENX(3DMESH_1D_length) + uses_drawid; + uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_1D), + .PredicateEnable = predicate_enable, + .IndirectParameterEnable = true, + .ExtendedParameter0Present = uses_drawid); + if (uses_drawid) + dw[len - 1] = 0; +} + +void +genX(CmdDrawMeshTasksIndirectNV)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline); + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + genX(cmd_buffer_flush_state)(cmd_buffer); + + if (cmd_state->conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) || + mesh_prog_data->uses_drawid; + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t i = 0; i < drawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + + mesh_load_indirect_parameters(cmd_buffer, &b, draw, uses_drawid, i); + + emit_indirect_3dmesh_1d(&cmd_buffer->batch, + cmd_state->conditional_render_enabled, uses_drawid); + + offset += stride; + } +} + +void +genX(CmdDrawMeshTasksIndirectCountNV)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + VkBuffer _countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer); + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct brw_task_prog_data 
*task_prog_data = get_task_prog_data(pipeline); + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + genX(cmd_buffer_flush_state)(cmd_buffer); + + bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) || + mesh_prog_data->uses_drawid; + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + struct mi_value max = + prepare_for_draw_count_predicate(cmd_buffer, &b, + count_buffer, countBufferOffset); + + for (uint32_t i = 0; i < maxDrawCount; i++) { + struct anv_address draw = anv_address_add(buffer->address, offset); + + emit_draw_count_predicate_cond(cmd_buffer, &b, i, max); + + mesh_load_indirect_parameters(cmd_buffer, &b, draw, uses_drawid, i); + + emit_indirect_3dmesh_1d(&cmd_buffer->batch, true, uses_drawid); + + offset += stride; + } +} +#endif /* GFX_VERx10 >= 125 */ + +void +genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute; + struct anv_compute_pipeline *pipeline = comp_state->pipeline; + + assert(pipeline->cs); + + genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config); + + genX(flush_pipeline_select_gpgpu)(cmd_buffer); + + /* Apply any pending pipeline flushes we may have. We want to apply them + * now because, if any of those flushes are for things like push constants, + * the GPU will read the state at weird times. + */ + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + if (cmd_buffer->state.compute.pipeline_dirty) { + /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE: + * + * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless + * the only bits that are changed are scoreboard related: Scoreboard + * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For + * these scoreboard related states, a MEDIA_STATE_FLUSH is + * sufficient." + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "flush compute state"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch); + + /* The workgroup size of the pipeline affects our push constant layout + * so flag push constants as dirty if we change the pipeline. 
+ */ + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + } + + if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) || + cmd_buffer->state.compute.pipeline_dirty) { + flush_descriptor_sets(cmd_buffer, + &cmd_buffer->state.compute.base, + VK_SHADER_STAGE_COMPUTE_BIT, + &pipeline->cs, 1); + cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT; + +#if GFX_VERx10 < 125 + uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)]; + struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { + .BindingTablePointer = + cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset, + .SamplerStatePointer = + cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset, + }; + GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc); + + struct anv_state state = + anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw, + pipeline->interface_descriptor_data, + GENX(INTERFACE_DESCRIPTOR_DATA_length), + 64); + + uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); + anv_batch_emit(&cmd_buffer->batch, + GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) { + mid.InterfaceDescriptorTotalLength = size; + mid.InterfaceDescriptorDataStartAddress = state.offset; + } +#endif + } + + if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) { + comp_state->push_data = + anv_cmd_buffer_cs_push_constants(cmd_buffer); + +#if GFX_VERx10 < 125 + if (comp_state->push_data.alloc_size) { + anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { + curbe.CURBETotalDataLength = comp_state->push_data.alloc_size; + curbe.CURBEDataStartAddress = comp_state->push_data.offset; + } + } +#endif + + cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT; + } + + cmd_buffer->state.compute.pipeline_dirty = false; + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); +} + +#if GFX_VER == 7 + +static VkResult +verify_cmd_parser(const struct anv_device *device, + int required_version, + const char *function) +{ + if (device->physical->cmd_parser_version < required_version) { + return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT, + "cmd parser version %d is required for %s", + required_version, function); + } else { + return VK_SUCCESS; + } +} + +#endif + +static void +anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer, + uint32_t baseGroupX, + uint32_t baseGroupY, + uint32_t baseGroupZ) +{ + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + struct anv_push_constants *push = + &cmd_buffer->state.compute.base.push_constants; + if (push->cs.base_work_group_id[0] != baseGroupX || + push->cs.base_work_group_id[1] != baseGroupY || + push->cs.base_work_group_id[2] != baseGroupZ) { + push->cs.base_work_group_id[0] = baseGroupX; + push->cs.base_work_group_id[1] = baseGroupY; + push->cs.base_work_group_id[2] = baseGroupZ; + + cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + } +} + +void genX(CmdDispatch)( + VkCommandBuffer commandBuffer, + uint32_t x, + uint32_t y, + uint32_t z) +{ + genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z); +} + +#if GFX_VERx10 >= 125 + +static inline void +emit_compute_walker(struct anv_cmd_buffer *cmd_buffer, + const struct anv_compute_pipeline *pipeline, bool indirect, + const struct brw_cs_prog_data *prog_data, + uint32_t groupCountX, uint32_t groupCountY, + uint32_t groupCountZ) +{ + struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute; + const struct anv_shader_bin *cs_bin = pipeline->cs; + bool predicate = 
cmd_buffer->state.conditional_render_enabled; + + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct brw_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(devinfo, prog_data, NULL); + + anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) { + cw.IndirectParameterEnable = indirect; + cw.PredicateEnable = predicate; + cw.SIMDSize = dispatch.simd_size / 16; + cw.IndirectDataStartAddress = comp_state->push_data.offset; + cw.IndirectDataLength = comp_state->push_data.alloc_size; + cw.LocalXMaximum = prog_data->local_size[0] - 1; + cw.LocalYMaximum = prog_data->local_size[1] - 1; + cw.LocalZMaximum = prog_data->local_size[2] - 1; + cw.ThreadGroupIDXDimension = groupCountX; + cw.ThreadGroupIDYDimension = groupCountY; + cw.ThreadGroupIDZDimension = groupCountZ; + cw.ExecutionMask = dispatch.right_mask; + cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0); + + cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { + .KernelStartPointer = cs_bin->kernel.offset, + .SamplerStatePointer = + cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset, + .BindingTablePointer = + cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset, + .BindingTableEntryCount = + 1 + MIN2(pipeline->cs->bind_map.surface_count, 30), + .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, + .SharedLocalMemorySize = encode_slm_size(GFX_VER, + prog_data->base.total_shared), + .NumberOfBarriers = prog_data->uses_barrier, + }; + } +} + +#else /* #if GFX_VERx10 >= 125 */ + +static inline void +emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer, + const struct anv_compute_pipeline *pipeline, bool indirect, + const struct brw_cs_prog_data *prog_data, + uint32_t groupCountX, uint32_t groupCountY, + uint32_t groupCountZ) +{ + bool predicate = (GFX_VER <= 7 && indirect) || + cmd_buffer->state.conditional_render_enabled; + + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct brw_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(devinfo, prog_data, NULL); + + anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) { + ggw.IndirectParameterEnable = indirect; + ggw.PredicateEnable = predicate; + ggw.SIMDSize = dispatch.simd_size / 16; + ggw.ThreadDepthCounterMaximum = 0; + ggw.ThreadHeightCounterMaximum = 0; + ggw.ThreadWidthCounterMaximum = dispatch.threads - 1; + ggw.ThreadGroupIDXDimension = groupCountX; + ggw.ThreadGroupIDYDimension = groupCountY; + ggw.ThreadGroupIDZDimension = groupCountZ; + ggw.RightExecutionMask = dispatch.right_mask; + ggw.BottomExecutionMask = 0xffffffff; + } + + anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf); +} + +#endif /* #if GFX_VERx10 >= 125 */ + +static inline void +emit_cs_walker(struct anv_cmd_buffer *cmd_buffer, + const struct anv_compute_pipeline *pipeline, bool indirect, + const struct brw_cs_prog_data *prog_data, + uint32_t groupCountX, uint32_t groupCountY, + uint32_t groupCountZ) +{ +#if GFX_VERx10 >= 125 + emit_compute_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX, + groupCountY, groupCountZ); +#else + emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX, + groupCountY, groupCountZ); +#endif +} + +void genX(CmdDispatchBase)( + VkCommandBuffer commandBuffer, + uint32_t baseGroupX, + uint32_t baseGroupY, + uint32_t baseGroupZ, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_compute_pipeline *pipeline = 
cmd_buffer->state.compute.pipeline; + const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline); + + anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX, + baseGroupY, baseGroupZ); + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_COMPUTE, + "compute", + groupCountX * groupCountY * groupCountZ * + prog_data->local_size[0] * prog_data->local_size[1] * + prog_data->local_size[2]); + + trace_intel_begin_compute(&cmd_buffer->trace); + + if (prog_data->uses_num_work_groups) { + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4); + uint32_t *sizes = state.map; + sizes[0] = groupCountX; + sizes[1] = groupCountY; + sizes[2] = groupCountZ; + cmd_buffer->state.compute.num_workgroups = (struct anv_address) { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = state.offset, + }; + + /* The num_workgroups buffer goes in the binding table */ + cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + } + + genX(cmd_buffer_flush_compute_state)(cmd_buffer); + + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); + + emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX, + groupCountY, groupCountZ); + + trace_intel_end_compute(&cmd_buffer->trace, + groupCountX, groupCountY, groupCountZ); +} + +#define GPGPU_DISPATCHDIMX 0x2500 +#define GPGPU_DISPATCHDIMY 0x2504 +#define GPGPU_DISPATCHDIMZ 0x2508 + +void genX(CmdDispatchIndirect)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline; + const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline); + struct anv_address addr = anv_address_add(buffer->address, offset); + UNUSED struct anv_batch *batch = &cmd_buffer->batch; + + anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0); + +#if GFX_VER == 7 + /* Linux 4.4 added command parser version 5 which allows the GPGPU + * indirect dispatch registers to be written. 
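+    *
+    * (The dispatch size is sourced from the GPGPU_DISPATCHDIM{X,Y,Z}
+    * registers defined above; the mi_store() calls below load them from
+    * the indirect buffer, which is the register write the command parser
+    * has to allow.)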
+ */ + if (verify_cmd_parser(cmd_buffer->device, 5, + "vkCmdDispatchIndirect") != VK_SUCCESS) + return; +#endif + + anv_measure_snapshot(cmd_buffer, + INTEL_SNAPSHOT_COMPUTE, + "compute indirect", + 0); + trace_intel_begin_compute(&cmd_buffer->trace); + + if (prog_data->uses_num_work_groups) { + cmd_buffer->state.compute.num_workgroups = addr; + + /* The num_workgroups buffer goes in the binding table */ + cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; + } + + genX(cmd_buffer_flush_compute_state)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + struct mi_value size_x = mi_mem32(anv_address_add(addr, 0)); + struct mi_value size_y = mi_mem32(anv_address_add(addr, 4)); + struct mi_value size_z = mi_mem32(anv_address_add(addr, 8)); + + mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x); + mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y); + mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z); + +#if GFX_VER <= 7 + /* predicate = (compute_dispatch_indirect_x_size == 0); */ + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + + /* predicate |= (compute_dispatch_indirect_y_size == 0); */ + mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y); + anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_OR; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + + /* predicate |= (compute_dispatch_indirect_z_size == 0); */ + mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z); + anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOAD; + mip.CombineOperation = COMBINE_OR; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + + /* predicate = !predicate; */ + anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_OR; + mip.CompareOperation = COMPARE_FALSE; + } + +#if GFX_VERx10 == 75 + if (cmd_buffer->state.conditional_render_enabled) { + /* predicate &= !(conditional_rendering_predicate == 0); */ + mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), + mi_reg32(ANV_PREDICATE_RESULT_REG)); + anv_batch_emit(batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_AND; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } + } +#endif + +#else /* GFX_VER > 7 */ + if (cmd_buffer->state.conditional_render_enabled) + genX(cmd_emit_conditional_render_predicate)(cmd_buffer); +#endif + + emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0); + + trace_intel_end_compute(&cmd_buffer->trace, 0, 0, 0); +} + +struct anv_state +genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer) +{ +#if GFX_VERx10 >= 125 + struct anv_device *device = cmd_buffer->device; + + struct anv_state state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + BRW_RT_DISPATCH_GLOBALS_SIZE, + 64); + struct brw_rt_scratch_layout layout; + uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in + * some cases? + */ + brw_rt_compute_scratch_layout(&layout, device->info, + stack_ids_per_dss, 1 << 10); + + struct GFX_RT_DISPATCH_GLOBALS rtdg = { + .MemBaseAddress = (struct anv_address) { + /* The ray query HW computes offsets from the top of the buffer, so + * let the address at the end of the buffer. 
+ */ + .bo = device->ray_query_bo, + .offset = device->ray_query_bo->size + }, + .AsyncRTStackSize = layout.ray_stack_stride / 64, + .NumDSSRTStacks = layout.stack_ids_per_dss, + .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS, + .Flags = RT_DEPTH_TEST_LESS_EQUAL, + .ResumeShaderTable = (struct anv_address) { + .bo = cmd_buffer->state.ray_query_shadow_bo, + }, + }; + GFX_RT_DISPATCH_GLOBALS_pack(NULL, state.map, &rtdg); + + return state; +#else + unreachable("Not supported"); +#endif +} + +#if GFX_VERx10 >= 125 +static void +calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3]) +{ + unsigned total_shift = 0; + memset(local_shift, 0, 3); + + bool progress; + do { + progress = false; + for (unsigned i = 0; i < 3; i++) { + assert(global[i] > 0); + if ((1 << local_shift[i]) < global[i]) { + progress = true; + local_shift[i]++; + total_shift++; + } + + if (total_shift == 3) + return; + } + } while(progress); + + /* Assign whatever's left to x */ + local_shift[0] += 3 - total_shift; +} + +static struct GFX_RT_SHADER_TABLE +vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region) +{ + return (struct GFX_RT_SHADER_TABLE) { + .BaseAddress = anv_address_from_u64(region->deviceAddress), + .Stride = region->stride, + }; +} + +static void +cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer, + const VkStridedDeviceAddressRegionKHR *raygen_sbt, + const VkStridedDeviceAddressRegionKHR *miss_sbt, + const VkStridedDeviceAddressRegionKHR *hit_sbt, + const VkStridedDeviceAddressRegionKHR *callable_sbt, + bool is_indirect, + uint32_t launch_width, + uint32_t launch_height, + uint32_t launch_depth, + uint64_t launch_size_addr) +{ + struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt; + struct anv_ray_tracing_pipeline *pipeline = rt->pipeline; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + /* If we have a known degenerate launch size, just bail */ + if (!is_indirect && + (launch_width == 0 || launch_height == 0 || launch_depth == 0)) + return; + + genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config); + genX(flush_pipeline_select_gpgpu)(cmd_buffer); + + cmd_buffer->state.rt.pipeline_dirty = false; + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* Add these to the reloc list as they're internal buffers that don't + * actually have relocs to pick them up manually. 
+ * + * TODO(RT): This is a bit of a hack + */ + anv_reloc_list_add_bo(cmd_buffer->batch.relocs, + cmd_buffer->batch.alloc, + rt->scratch.bo); + + /* Allocate and set up our RT_DISPATCH_GLOBALS */ + struct anv_state rtdg_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + BRW_RT_PUSH_CONST_OFFSET + + sizeof(struct anv_push_constants), + 64); + + struct GFX_RT_DISPATCH_GLOBALS rtdg = { + .MemBaseAddress = (struct anv_address) { + .bo = rt->scratch.bo, + .offset = rt->scratch.layout.ray_stack_start, + }, + .CallStackHandler = + anv_shader_bin_get_bsr(cmd_buffer->device->rt_trivial_return, 0), + .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64, + .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss, + .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS, + .Flags = RT_DEPTH_TEST_LESS_EQUAL, + .HitGroupTable = vk_sdar_to_shader_table(hit_sbt), + .MissGroupTable = vk_sdar_to_shader_table(miss_sbt), + .SWStackSize = rt->scratch.layout.sw_stack_size / 64, + .LaunchWidth = launch_width, + .LaunchHeight = launch_height, + .LaunchDepth = launch_depth, + .CallableGroupTable = vk_sdar_to_shader_table(callable_sbt), + }; + GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg); + + /* Push constants go after the RT_DISPATCH_GLOBALS */ + assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET); + memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET, + &cmd_buffer->state.rt.base.push_constants, + sizeof(struct anv_push_constants)); + + struct anv_address rtdg_addr = { + .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo, + .offset = rtdg_state.offset, + }; + + uint8_t local_size_log2[3]; + uint32_t global_size[3] = {}; + if (is_indirect) { + /* Pick a local size that's probably ok. We assume most TraceRays calls + * will use a two-dimensional dispatch size. Worst case, our initial + * dispatch will be a little slower than it has to be. + */ + local_size_log2[0] = 2; + local_size_log2[1] = 1; + local_size_log2[2] = 0; + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + struct mi_value launch_size[3] = { + mi_mem32(anv_address_from_u64(launch_size_addr + 0)), + mi_mem32(anv_address_from_u64(launch_size_addr + 4)), + mi_mem32(anv_address_from_u64(launch_size_addr + 8)), + }; + + /* Store the original launch size into RT_DISPATCH_GLOBALS + * + * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets + * moved into a genX version. + */ + mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)), + mi_value_ref(&b, launch_size[0])); + mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)), + mi_value_ref(&b, launch_size[1])); + mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)), + mi_value_ref(&b, launch_size[2])); + + /* Compute the global dispatch size */ + for (unsigned i = 0; i < 3; i++) { + if (local_size_log2[i] == 0) + continue; + + /* global_size = DIV_ROUND_UP(launch_size, local_size) + * + * Fortunately for us MI_ALU math is 64-bit and , mi_ushr32_imm + * has the semantics of shifting the enture 64-bit value and taking + * the bottom 32 so we don't have to worry about roll-over. 
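+          *
+          * (Illustrative example: launch_size[i] = 100 with
+          * local_size_log2[i] = 2 becomes (100 + 3) >> 2 = 25 thread
+          * groups, i.e. DIV_ROUND_UP(100, 4).)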
+ */ + uint32_t local_size = 1 << local_size_log2[i]; + launch_size[i] = mi_iadd(&b, launch_size[i], + mi_imm(local_size - 1)); + launch_size[i] = mi_ushr32_imm(&b, launch_size[i], + local_size_log2[i]); + } + + mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]); + mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]); + mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]); + } else { + uint32_t launch_size[3] = { launch_width, launch_height, launch_depth }; + calc_local_trace_size(local_size_log2, launch_size); + + for (unsigned i = 0; i < 3; i++) { + /* We have to be a bit careful here because DIV_ROUND_UP adds to the + * numerator value may overflow. Cast to uint64_t to avoid this. + */ + uint32_t local_size = 1 << local_size_log2[i]; + global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size); + } + } + + anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) { + cw.IndirectParameterEnable = is_indirect; + cw.PredicateEnable = false; + cw.SIMDSize = SIMD8; + cw.LocalXMaximum = (1 << local_size_log2[0]) - 1; + cw.LocalYMaximum = (1 << local_size_log2[1]) - 1; + cw.LocalZMaximum = (1 << local_size_log2[2]) - 1; + cw.ThreadGroupIDXDimension = global_size[0]; + cw.ThreadGroupIDYDimension = global_size[1]; + cw.ThreadGroupIDZDimension = global_size[2]; + cw.ExecutionMask = 0xff; + cw.EmitInlineParameter = true; + cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0); + + const gl_shader_stage s = MESA_SHADER_RAYGEN; + struct anv_device *device = cmd_buffer->device; + struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s]; + struct anv_state *samplers = &cmd_buffer->state.samplers[s]; + cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { + .KernelStartPointer = device->rt_trampoline->kernel.offset, + .SamplerStatePointer = samplers->offset, + /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */ + .SamplerCount = 0, + .BindingTablePointer = surfaces->offset, + .NumberofThreadsinGPGPUThreadGroup = 1, + .BTDMode = true, + }; + + struct brw_rt_raygen_trampoline_params trampoline_params = { + .rt_disp_globals_addr = anv_address_physical(rtdg_addr), + .raygen_bsr_addr = raygen_sbt->deviceAddress, + .is_indirect = is_indirect, + .local_group_size_log2 = { + local_size_log2[0], + local_size_log2[1], + local_size_log2[2], + }, + }; + STATIC_ASSERT(sizeof(trampoline_params) == 32); + memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params)); + } +} + +void +genX(CmdTraceRaysKHR)( + VkCommandBuffer commandBuffer, + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, + uint32_t width, + uint32_t height, + uint32_t depth) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer_trace_rays(cmd_buffer, + pRaygenShaderBindingTable, + pMissShaderBindingTable, + pHitShaderBindingTable, + pCallableShaderBindingTable, + false /* is_indirect */, + width, height, depth, + 0 /* launch_size_addr */); +} + +void +genX(CmdTraceRaysIndirectKHR)( + VkCommandBuffer commandBuffer, + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, + VkDeviceAddress indirectDeviceAddress) +{ + 
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer_trace_rays(cmd_buffer, + pRaygenShaderBindingTable, + pMissShaderBindingTable, + pHitShaderBindingTable, + pCallableShaderBindingTable, + true /* is_indirect */, + 0, 0, 0, /* width, height, depth, */ + indirectDeviceAddress); +} +#endif /* GFX_VERx10 >= 125 */ + +static void +genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer, + uint32_t pipeline) +{ + UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info; + + if (cmd_buffer->state.current_pipeline == pipeline) + return; + +#if GFX_VER >= 8 && GFX_VER < 10 + /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT: + * + * Software must clear the COLOR_CALC_STATE Valid field in + * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT + * with Pipeline Select set to GPGPU. + * + * The internal hardware docs recommend the same workaround for Gfx9 + * hardware too. + */ + if (pipeline == GPGPU) + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t); +#endif + +#if GFX_VER == 9 + if (pipeline == _3D) { + /* There is a mid-object preemption workaround which requires you to + * re-emit MEDIA_VFE_STATE after switching from GPGPU to 3D. However, + * even without preemption, we have issues with geometry flickering when + * GPGPU and 3D are back-to-back and this seems to fix it. We don't + * really know why. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) { + vfe.MaximumNumberofThreads = + devinfo->max_cs_threads * devinfo->subslice_total - 1; + vfe.NumberofURBEntries = 2; + vfe.URBEntryAllocationSize = 2; + } + + /* We just emitted a dummy MEDIA_VFE_STATE so now that packet is + * invalid. Set the compute pipeline to dirty to force a re-emit of the + * pipeline in case we get back-to-back dispatch calls with the same + * pipeline and a PIPELINE_SELECT in between. + */ + cmd_buffer->state.compute.pipeline_dirty = true; + } +#endif + + /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction] + * PIPELINE_SELECT [DevBWR+]": + * + * Project: DEVSNB+ + * + * Software must ensure all the write caches are flushed through a + * stalling PIPE_CONTROL command followed by another PIPE_CONTROL + * command to invalidate read only caches prior to programming + * MI_PIPELINE_SELECT command to change the Pipeline Select Mode. + * + * Note the cmd_buffer_apply_pipe_flushes will split this into two + * PIPE_CONTROLs. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | + ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT | + ANV_PIPE_STATE_CACHE_INVALIDATE_BIT | + ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT | + ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, + "flush and invalidate for PIPELINE_SELECT"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) { +#if GFX_VER >= 9 + ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3; + ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12; +#endif + ps.PipelineSelection = pipeline; + } + +#if GFX_VER == 9 + if (devinfo->platform == INTEL_PLATFORM_GLK) { + /* Project: DevGLK + * + * "This chicken bit works around a hardware issue with barrier logic + * encountered when switching between GPGPU and 3D pipelines. To + * workaround the issue, this mode bit should be set after a pipeline + * is selected." 
+ */ + anv_batch_write_reg(&cmd_buffer->batch, GENX(SLICE_COMMON_ECO_CHICKEN1), scec1) { + scec1.GLKBarrierMode = pipeline == GPGPU ? GLK_BARRIER_MODE_GPGPU + : GLK_BARRIER_MODE_3D_HULL; + scec1.GLKBarrierModeMask = 1; + } + } +#endif + + cmd_buffer->state.current_pipeline = pipeline; +} + +void +genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer) +{ + genX(flush_pipeline_select)(cmd_buffer, _3D); +} + +void +genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer) +{ + genX(flush_pipeline_select)(cmd_buffer, GPGPU); +} + +void +genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer) +{ + if (GFX_VER >= 8) + return; + + /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER: + * + * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any + * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, + * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first + * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit + * set), followed by a pipelined depth cache flush (PIPE_CONTROL with + * Depth Flush Bit set, followed by another pipelined depth stall + * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise + * guarantee that the pipeline from WM onwards is already flushed (e.g., + * via a preceding MI_FLUSH)." + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { + pipe.DepthStallEnable = true; + anv_debug_dump_pc(pipe); + } + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { + pipe.DepthCacheFlushEnable = true; +#if GFX_VER >= 12 + pipe.TileCacheFlushEnable = true; +#endif + anv_debug_dump_pc(pipe); + } + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) { + pipe.DepthStallEnable = true; + anv_debug_dump_pc(pipe); + } +} + +void +genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer, + const struct isl_surf *surf) +{ +#if GFX_VERx10 == 120 + const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM && + surf->samples == 1; + + switch (cmd_buffer->state.depth_reg_mode) { + case ANV_DEPTH_REG_MODE_HW_DEFAULT: + if (!is_d16_1x_msaa) + return; + break; + case ANV_DEPTH_REG_MODE_D16_1X_MSAA: + if (is_d16_1x_msaa) + return; + break; + case ANV_DEPTH_REG_MODE_UNKNOWN: + break; + } + + /* We'll change some CHICKEN registers depending on the depth surface + * format. Do a depth flush and stall so the pipeline is not using these + * settings while we change the registers. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | + ANV_PIPE_DEPTH_STALL_BIT | + ANV_PIPE_END_OF_PIPE_SYNC_BIT, + "Workaround: Stop pipeline for 14010455700"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* Wa_14010455700 + * + * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer + * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”. + */ + anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) { + reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa; + reg.HIZPlaneOptimizationdisablebitMask = true; + } + + cmd_buffer->state.depth_reg_mode = + is_d16_1x_msaa ? ANV_DEPTH_REG_MODE_D16_1X_MSAA : + ANV_DEPTH_REG_MODE_HW_DEFAULT; +#endif +} + +/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS: + * + * "The VF cache needs to be invalidated before binding and then using + * Vertex Buffers that overlap with any previously bound Vertex Buffer + * (at a 64B granularity) since the last invalidation. 
A VF cache + * invalidate is performed by setting the "VF Cache Invalidation Enable" + * bit in PIPE_CONTROL." + * + * This is implemented by carefully tracking all vertex and index buffer + * bindings and flushing if the cache ever ends up with a range in the cache + * that would exceed 4 GiB. This is implemented in three parts: + * + * 1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called + * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the + * tracking code of the new binding. If this new binding would cause + * the cache to have a too-large range on the next draw call, a pipeline + * stall and VF cache invalidate are added to pending_pipeline_bits. + * + * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to + * empty whenever we emit a VF invalidate. + * + * 3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called + * after every 3DPRIMITIVE and copies the bound range into the dirty + * range for each used buffer. This has to be a separate step because + * we don't always re-bind all buffers and so 1. can't know which + * buffers are actually bound. + */ +void +genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + int vb_index, + struct anv_address vb_address, + uint32_t vb_size) +{ + if (GFX_VER < 8 || GFX_VER > 9 || + anv_use_relocations(cmd_buffer->device->physical)) + return; + + struct anv_vb_cache_range *bound, *dirty; + if (vb_index == -1) { + bound = &cmd_buffer->state.gfx.ib_bound_range; + dirty = &cmd_buffer->state.gfx.ib_dirty_range; + } else { + assert(vb_index >= 0); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index]; + } + + if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty, + vb_address, + vb_size)) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_VF_CACHE_INVALIDATE_BIT, + "vb > 32b range"); + } +} + +void +genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer, + uint32_t access_type, + uint64_t vb_used) +{ + if (GFX_VER < 8 || GFX_VER > 9 || + anv_use_relocations(cmd_buffer->device->physical)) + return; + + if (access_type == RANDOM) { + /* We have an index buffer */ + struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range; + struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } + + uint64_t mask = vb_used; + while (mask) { + int i = u_bit_scan64(&mask); + assert(i >= 0); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges)); + assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges)); + + struct anv_vb_cache_range *bound, *dirty; + bound = &cmd_buffer->state.gfx.vb_bound_ranges[i]; + dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i]; + + if (bound->end > bound->start) { + dirty->start = MIN2(dirty->start, bound->start); + dirty->end = MAX2(dirty->end, bound->end); + } + } +} + +/** + * Update the pixel hashing modes that determine the balancing of PS threads + * across subslices and slices. + * + * \param width Width bound of the rendering area (already scaled down if \p + * scale is greater than 1). 
+ * \param height Height bound of the rendering area (already scaled down if \p + * scale is greater than 1). + * \param scale The number of framebuffer samples that could potentially be + * affected by an individual channel of the PS thread. This is + * typically one for single-sampled rendering, but for operations + * like CCS resolves and fast clears a single PS invocation may + * update a huge number of pixels, in which case a finer + * balancing is desirable in order to maximally utilize the + * bandwidth available. UINT_MAX can be used as shorthand for + * "finest hashing mode available". + */ +void +genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, + unsigned width, unsigned height, + unsigned scale) +{ +#if GFX_VER == 9 + const struct intel_device_info *devinfo = cmd_buffer->device->info; + const unsigned slice_hashing[] = { + /* Because all Gfx9 platforms with more than one slice require + * three-way subslice hashing, a single "normal" 16x16 slice hashing + * block is guaranteed to suffer from substantial imbalance, with one + * subslice receiving twice as much work as the other two in the + * slice. + * + * The performance impact of that would be particularly severe when + * three-way hashing is also in use for slice balancing (which is the + * case for all Gfx9 GT4 platforms), because one of the slices + * receives one every three 16x16 blocks in either direction, which + * is roughly the periodicity of the underlying subslice imbalance + * pattern ("roughly" because in reality the hardware's + * implementation of three-way hashing doesn't do exact modulo 3 + * arithmetic, which somewhat decreases the magnitude of this effect + * in practice). This leads to a systematic subslice imbalance + * within that slice regardless of the size of the primitive. The + * 32x32 hashing mode guarantees that the subslice imbalance within a + * single slice hashing block is minimal, largely eliminating this + * effect. + */ + _32x32, + /* Finest slice hashing mode available. */ + NORMAL + }; + const unsigned subslice_hashing[] = { + /* 16x16 would provide a slight cache locality benefit especially + * visible in the sampler L1 cache efficiency of low-bandwidth + * non-LLC platforms, but it comes at the cost of greater subslice + * imbalance for primitives of dimensions approximately intermediate + * between 16x4 and 16x16. + */ + _16x4, + /* Finest subslice hashing mode available. */ + _8x4 + }; + /* Dimensions of the smallest hashing block of a given hashing mode. If + * the rendering area is smaller than this there can't possibly be any + * benefit from switching to this mode, so we optimize out the + * transition. + */ + const unsigned min_size[][2] = { + { 16, 4 }, + { 8, 4 } + }; + const unsigned idx = scale > 1; + + if (cmd_buffer->state.current_hash_scale != scale && + (width > min_size[idx][0] || height > min_size[idx][1])) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "change pixel hash mode"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_write_reg(&cmd_buffer->batch, GENX(GT_MODE), gt) { + gt.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0); + gt.SliceHashingMask = (devinfo->num_slices > 1 ? 
-1 : 0); + gt.SubsliceHashing = subslice_hashing[idx]; + gt.SubsliceHashingMask = -1; + } + + cmd_buffer->state.current_hash_scale = scale; + } +#endif +} + +static void +cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_device *device = cmd_buffer->device; + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + /* FIXME: Width and Height are wrong */ + + genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer); + + uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch, + device->isl_dev.ds.size / 4); + if (dw == NULL) + return; + + struct isl_view isl_view = {}; + struct isl_depth_stencil_hiz_emit_info info = { + .view = &isl_view, + .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT), + }; + + if (gfx->depth_att.iview != NULL) { + isl_view = gfx->depth_att.iview->planes[0].isl; + } else if (gfx->stencil_att.iview != NULL) { + isl_view = gfx->stencil_att.iview->planes[0].isl; + } + + if (gfx->view_mask) { + assert(isl_view.array_len == 0 || + isl_view.array_len >= util_last_bit(gfx->view_mask)); + isl_view.array_len = util_last_bit(gfx->view_mask); + } else { + assert(isl_view.array_len == 0 || + isl_view.array_len >= util_last_bit(gfx->layer_count)); + isl_view.array_len = gfx->layer_count; + } + + if (gfx->depth_att.iview != NULL) { + const struct anv_image_view *iview = gfx->depth_att.iview; + const struct anv_image *image = iview->image; + + const uint32_t depth_plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT); + const struct anv_surface *depth_surface = + &image->planes[depth_plane].primary_surface; + const struct anv_address depth_address = + anv_image_address(image, &depth_surface->memory_range); + + info.depth_surf = &depth_surface->isl; + + info.depth_address = + anv_batch_emit_reloc(&cmd_buffer->batch, + dw + device->isl_dev.ds.depth_offset / 4, + depth_address.bo, depth_address.offset); + info.mocs = + anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT); + + info.hiz_usage = gfx->depth_att.aux_usage; + if (info.hiz_usage != ISL_AUX_USAGE_NONE) { + assert(isl_aux_usage_has_hiz(info.hiz_usage)); + + const struct anv_surface *hiz_surface = + &image->planes[depth_plane].aux_surface; + const struct anv_address hiz_address = + anv_image_address(image, &hiz_surface->memory_range); + + info.hiz_surf = &hiz_surface->isl; + + info.hiz_address = + anv_batch_emit_reloc(&cmd_buffer->batch, + dw + device->isl_dev.ds.hiz_offset / 4, + hiz_address.bo, hiz_address.offset); + + info.depth_clear_value = ANV_HZ_FC_VAL; + } + } + + if (gfx->stencil_att.iview != NULL) { + const struct anv_image_view *iview = gfx->stencil_att.iview; + const struct anv_image *image = iview->image; + + const uint32_t stencil_plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); + const struct anv_surface *stencil_surface = + &image->planes[stencil_plane].primary_surface; + const struct anv_address stencil_address = + anv_image_address(image, &stencil_surface->memory_range); + + info.stencil_surf = &stencil_surface->isl; + + info.stencil_aux_usage = image->planes[stencil_plane].aux_usage; + info.stencil_address = + anv_batch_emit_reloc(&cmd_buffer->batch, + dw + device->isl_dev.ds.stencil_offset / 4, + stencil_address.bo, stencil_address.offset); + info.mocs = + anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT); + } + + isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info); + + if (info.depth_surf) + genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf); + + if (GFX_VER >= 12) { + 
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + /* Wa_1408224581 + * + * Workaround: Gfx12LP Astep only An additional pipe control with + * post-sync = store dword operation would be required.( w/a is to + * have an additional pipe control after the stencil state whenever + * the surface state bits of this state is changing). + * + * This also seems sufficient to handle Wa_14014148106. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.PostSyncOperation = WriteImmediateData; + pc.Address = cmd_buffer->device->workaround_address; + } + } + cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage); +} + +static void +cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer, + const struct anv_image_view *fsr_iview) +{ +#if GFX_VERx10 >= 125 + struct anv_device *device = cmd_buffer->device; + + if (!device->vk.enabled_extensions.KHR_fragment_shading_rate) + return; + + uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch, + device->isl_dev.cpb.size / 4); + if (dw == NULL) + return; + + struct isl_cpb_emit_info info = { }; + + if (fsr_iview) { + info.view = &fsr_iview->planes[0].isl; + info.surf = &fsr_iview->image->planes[0].primary_surface.isl; + info.address = + anv_batch_emit_reloc(&cmd_buffer->batch, + dw + device->isl_dev.cpb.offset / 4, + fsr_iview->image->bindings[0].address.bo, + fsr_iview->image->bindings[0].address.offset + + fsr_iview->image->bindings[0].memory_range.offset); + info.mocs = + anv_mocs(device, fsr_iview->image->bindings[0].address.bo, + ISL_SURF_USAGE_CPB_BIT); + } + + isl_emit_cpb_control_s(&device->isl_dev, dw, &info); +#endif /* GFX_VERx10 >= 125 */ +} + +static VkImageLayout +attachment_initial_layout(const VkRenderingAttachmentInfo *att) +{ + const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info = + vk_find_struct_const(att->pNext, + RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA); + if (layout_info != NULL) + return layout_info->initialLayout; + + return att->imageLayout; +} + +void genX(CmdBeginRendering)( + VkCommandBuffer commandBuffer, + const VkRenderingInfo* pRenderingInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + VkResult result; + + if (!is_render_queue_cmd_buffer(cmd_buffer)) { + assert(!"Trying to start a render pass on non-render queue!"); + anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN); + return; + } + + anv_measure_beginrenderpass(cmd_buffer); + trace_intel_begin_render_pass(&cmd_buffer->trace); + + gfx->rendering_flags = pRenderingInfo->flags; + gfx->render_area = pRenderingInfo->renderArea; + gfx->view_mask = pRenderingInfo->viewMask; + gfx->layer_count = pRenderingInfo->layerCount; + gfx->samples = 0; + + const bool is_multiview = gfx->view_mask != 0; + const VkRect2D render_area = gfx->render_area; + const uint32_t layers = + is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count; + + /* The framebuffer size is at least large enough to contain the render + * area. Because a zero renderArea is possible, we MAX with 1. 
+ */ + struct isl_extent3d fb_size = { + .w = MAX2(1, render_area.offset.x + render_area.extent.width), + .h = MAX2(1, render_area.offset.y + render_area.extent.height), + .d = layers, + }; + + const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount; + result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count); + if (result != VK_SUCCESS) + return; + + genX(flush_pipeline_select_3d)(cmd_buffer); + + for (uint32_t i = 0; i < gfx->color_att_count; i++) { + if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE) + continue; + + const VkRenderingAttachmentInfo *att = + &pRenderingInfo->pColorAttachments[i]; + ANV_FROM_HANDLE(anv_image_view, iview, att->imageView); + const VkImageLayout initial_layout = attachment_initial_layout(att); + + assert(render_area.offset.x + render_area.extent.width <= + iview->vk.extent.width); + assert(render_area.offset.y + render_area.extent.height <= + iview->vk.extent.height); + assert(layers <= iview->vk.layer_count); + + fb_size.w = MAX2(fb_size.w, iview->vk.extent.width); + fb_size.h = MAX2(fb_size.h, iview->vk.extent.height); + + assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples); + gfx->samples |= iview->vk.image->samples; + + enum isl_aux_usage aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, + iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + att->imageLayout); + + union isl_color_value fast_clear_color = { .u32 = { 0, } }; + + if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR && + !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) { + const union isl_color_value clear_color = + vk_to_isl_color_with_format(att->clearValue.color, + iview->planes[0].isl.format); + + /* We only support fast-clears on the first layer */ + const bool fast_clear = + (!is_multiview || (gfx->view_mask & 1)) && + anv_can_fast_clear_color_view(cmd_buffer->device, iview, + att->imageLayout, clear_color, + layers, render_area); + + if (att->imageLayout != initial_layout) { + assert(render_area.offset.x == 0 && render_area.offset.y == 0 && + render_area.extent.width == iview->vk.extent.width && + render_area.extent.height == iview->vk.extent.height); + if (is_multiview) { + u_foreach_bit(view, gfx->view_mask) { + transition_color_buffer(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + iview->vk.base_mip_level, 1, + iview->vk.base_array_layer + view, + 1, /* layer_count */ + initial_layout, att->imageLayout, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + fast_clear); + } + } else { + transition_color_buffer(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + iview->vk.base_mip_level, 1, + iview->vk.base_array_layer, + gfx->layer_count, + initial_layout, att->imageLayout, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + fast_clear); + } + } + + uint32_t clear_view_mask = pRenderingInfo->viewMask; + uint32_t base_clear_layer = iview->vk.base_array_layer; + uint32_t clear_layer_count = gfx->layer_count; + if (fast_clear) { + /* We only support fast-clears on the first layer */ + assert(iview->vk.base_mip_level == 0 && + iview->vk.base_array_layer == 0); + + fast_clear_color = clear_color; + + if (iview->image->vk.samples == 1) { + anv_image_ccs_op(cmd_buffer, iview->image, + iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, + VK_IMAGE_ASPECT_COLOR_BIT, + 0, 0, 1, ISL_AUX_OP_FAST_CLEAR, + &fast_clear_color, + false); + } else { + anv_image_mcs_op(cmd_buffer, iview->image, + iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, + 
VK_IMAGE_ASPECT_COLOR_BIT, + 0, 1, ISL_AUX_OP_FAST_CLEAR, + &fast_clear_color, + false); + } + clear_view_mask &= ~1u; + base_clear_layer++; + clear_layer_count--; + + if (isl_color_value_is_zero(clear_color, + iview->planes[0].isl.format)) { + /* This image has the auxiliary buffer enabled. We can mark the + * subresource as not needing a resolve because the clear color + * will match what's in every RENDER_SURFACE_STATE object when + * it's being used for sampling. + */ + set_image_fast_clear_state(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + ANV_FAST_CLEAR_DEFAULT_VALUE); + } else { + set_image_fast_clear_state(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + ANV_FAST_CLEAR_ANY); + } + } + + if (is_multiview) { + u_foreach_bit(view, clear_view_mask) { + anv_image_clear_color(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + aux_usage, + iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, + iview->vk.base_mip_level, + iview->vk.base_array_layer + view, 1, + render_area, clear_color); + } + } else { + anv_image_clear_color(cmd_buffer, iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + aux_usage, + iview->planes[0].isl.format, + iview->planes[0].isl.swizzle, + iview->vk.base_mip_level, + base_clear_layer, clear_layer_count, + render_area, clear_color); + } + } else { + /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */ + assert(att->imageLayout == initial_layout); + } + + gfx->color_att[i].vk_format = iview->vk.format; + gfx->color_att[i].iview = iview; + gfx->color_att[i].layout = att->imageLayout; + gfx->color_att[i].aux_usage = aux_usage; + + struct isl_view isl_view = iview->planes[0].isl; + if (pRenderingInfo->viewMask) { + assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask)); + isl_view.array_len = util_last_bit(pRenderingInfo->viewMask); + } else { + assert(isl_view.array_len >= pRenderingInfo->layerCount); + isl_view.array_len = pRenderingInfo->layerCount; + } + + anv_image_fill_surface_state(cmd_buffer->device, + iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + &isl_view, + ISL_SURF_USAGE_RENDER_TARGET_BIT, + aux_usage, &fast_clear_color, + 0, /* anv_image_view_state_flags */ + &gfx->color_att[i].surface_state, + NULL); + + add_surface_state_relocs(cmd_buffer, gfx->color_att[i].surface_state); + + if (GFX_VER < 10 && + (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || + (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) && + iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE && + iview->planes[0].isl.base_level == 0 && + iview->planes[0].isl.base_array_layer == 0) { + genX(copy_fast_clear_dwords)(cmd_buffer, + gfx->color_att[i].surface_state.state, + iview->image, + VK_IMAGE_ASPECT_COLOR_BIT, + false /* copy to ss */); + } + + if (att->resolveMode != VK_RESOLVE_MODE_NONE) { + gfx->color_att[i].resolve_mode = att->resolveMode; + gfx->color_att[i].resolve_iview = + anv_image_view_from_handle(att->resolveImageView); + gfx->color_att[i].resolve_layout = att->resolveImageLayout; + } + } + + const struct anv_image_view *fsr_iview = NULL; + const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att = + vk_find_struct_const(pRenderingInfo->pNext, + RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR); + if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) { + fsr_iview = anv_image_view_from_handle(fsr_att->imageView); + /* imageLayout and shadingRateAttachmentTexelSize are ignored */ + } + + const struct anv_image_view *ds_iview = NULL; + const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment; + 
const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment; + if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) || + (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) { + const struct anv_image_view *d_iview = NULL, *s_iview = NULL; + VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED; + VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED; + VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED; + VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED; + enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE; + enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE; + float depth_clear_value = 0; + uint32_t stencil_clear_value = 0; + + if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) { + d_iview = anv_image_view_from_handle(d_att->imageView); + initial_depth_layout = attachment_initial_layout(d_att); + depth_layout = d_att->imageLayout; + depth_aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, + d_iview->image, + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + depth_layout); + depth_clear_value = d_att->clearValue.depthStencil.depth; + } + + if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) { + s_iview = anv_image_view_from_handle(s_att->imageView); + initial_stencil_layout = attachment_initial_layout(s_att); + stencil_layout = s_att->imageLayout; + stencil_aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, + s_iview->image, + VK_IMAGE_ASPECT_STENCIL_BIT, + VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT, + stencil_layout); + stencil_clear_value = s_att->clearValue.depthStencil.stencil; + } + + assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview); + ds_iview = d_iview != NULL ? d_iview : s_iview; + assert(ds_iview != NULL); + + assert(render_area.offset.x + render_area.extent.width <= + ds_iview->vk.extent.width); + assert(render_area.offset.y + render_area.extent.height <= + ds_iview->vk.extent.height); + assert(layers <= ds_iview->vk.layer_count); + + fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width); + fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height); + + assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples); + gfx->samples |= ds_iview->vk.image->samples; + + VkImageAspectFlags clear_aspects = 0; + if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR && + !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) + clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR && + !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) + clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + + if (clear_aspects != 0) { + const bool hiz_clear = + anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview, + depth_layout, clear_aspects, + depth_clear_value, + render_area); + + if (depth_layout != initial_depth_layout) { + assert(render_area.offset.x == 0 && render_area.offset.y == 0 && + render_area.extent.width == d_iview->vk.extent.width && + render_area.extent.height == d_iview->vk.extent.height); + + if (is_multiview) { + u_foreach_bit(view, gfx->view_mask) { + transition_depth_buffer(cmd_buffer, d_iview->image, + d_iview->vk.base_array_layer + view, + 1 /* layer_count */, + initial_depth_layout, depth_layout, + hiz_clear); + } + } else { + transition_depth_buffer(cmd_buffer, d_iview->image, + d_iview->vk.base_array_layer, + gfx->layer_count, + initial_depth_layout, depth_layout, + hiz_clear); + } + } + + if (stencil_layout != initial_stencil_layout) { + 
assert(render_area.offset.x == 0 && render_area.offset.y == 0 && + render_area.extent.width == s_iview->vk.extent.width && + render_area.extent.height == s_iview->vk.extent.height); + + if (is_multiview) { + u_foreach_bit(view, gfx->view_mask) { + transition_stencil_buffer(cmd_buffer, s_iview->image, + s_iview->vk.base_mip_level, 1, + s_iview->vk.base_array_layer + view, + 1 /* layer_count */, + initial_stencil_layout, + stencil_layout, + hiz_clear); + } + } else { + transition_stencil_buffer(cmd_buffer, s_iview->image, + s_iview->vk.base_mip_level, 1, + s_iview->vk.base_array_layer, + gfx->layer_count, + initial_stencil_layout, + stencil_layout, + hiz_clear); + } + } + + if (is_multiview) { + uint32_t clear_view_mask = pRenderingInfo->viewMask; + while (clear_view_mask) { + int view = u_bit_scan(&clear_view_mask); + + uint32_t level = ds_iview->vk.base_mip_level; + uint32_t layer = ds_iview->vk.base_array_layer + view; + + if (hiz_clear) { + anv_image_hiz_clear(cmd_buffer, ds_iview->image, + clear_aspects, + level, layer, 1, + render_area, + stencil_clear_value); + } else { + anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image, + clear_aspects, + depth_aux_usage, + level, layer, 1, + render_area, + depth_clear_value, + stencil_clear_value); + } + } + } else { + uint32_t level = ds_iview->vk.base_mip_level; + uint32_t base_layer = ds_iview->vk.base_array_layer; + uint32_t layer_count = gfx->layer_count; + + if (hiz_clear) { + anv_image_hiz_clear(cmd_buffer, ds_iview->image, + clear_aspects, + level, base_layer, layer_count, + render_area, + stencil_clear_value); + } else { + anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image, + clear_aspects, + depth_aux_usage, + level, base_layer, layer_count, + render_area, + depth_clear_value, + stencil_clear_value); + } + } + } else { + /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. 
*/ + assert(depth_layout == initial_depth_layout); + assert(stencil_layout == initial_stencil_layout); + } + + if (d_iview != NULL) { + gfx->depth_att.vk_format = d_iview->vk.format; + gfx->depth_att.iview = d_iview; + gfx->depth_att.layout = depth_layout; + gfx->depth_att.aux_usage = depth_aux_usage; + if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) { + assert(d_att->resolveImageView != VK_NULL_HANDLE); + gfx->depth_att.resolve_mode = d_att->resolveMode; + gfx->depth_att.resolve_iview = + anv_image_view_from_handle(d_att->resolveImageView); + gfx->depth_att.resolve_layout = d_att->resolveImageLayout; + } + } + + if (s_iview != NULL) { + gfx->stencil_att.vk_format = s_iview->vk.format; + gfx->stencil_att.iview = s_iview; + gfx->stencil_att.layout = stencil_layout; + gfx->stencil_att.aux_usage = stencil_aux_usage; + if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) { + assert(s_att->resolveImageView != VK_NULL_HANDLE); + gfx->stencil_att.resolve_mode = s_att->resolveMode; + gfx->stencil_att.resolve_iview = + anv_image_view_from_handle(s_att->resolveImageView); + gfx->stencil_att.resolve_layout = s_att->resolveImageLayout; + } + } + } + + /* Finally, now that we know the right size, set up the null surface */ + assert(util_bitcount(gfx->samples) <= 1); + isl_null_fill_state(&cmd_buffer->device->isl_dev, + gfx->null_surface_state.map, + .size = fb_size); + + for (uint32_t i = 0; i < gfx->color_att_count; i++) { + if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE) + continue; + + isl_null_fill_state(&cmd_buffer->device->isl_dev, + gfx->color_att[i].surface_state.state.map, + .size = fb_size); + } + + /****** We can now start emitting code to begin the render pass ******/ + + gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; + + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views. If the client asks for instancing, we need to use the + * Instance Data Step Rate to ensure that we repeat the client's + * per-instance data once for each view. Since this bit is in + * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top + * of each subpass. + */ + if (GFX_VER == 7) + gfx->vb_dirty |= ~0; + + /* It is possible to start a render pass with an old pipeline. Because the + * render pass and subpass index are both baked into the pipeline, this is + * highly unlikely. In order to do so, it requires that you have a render + * pass with a single subpass and that you use that render pass twice + * back-to-back and use the same pipeline at the start of the second render + * pass as at the end of the first. In order to avoid unpredictable issues + * with this edge case, we just dirty the pipeline at the start of every + * subpass. + */ + gfx->dirty |= ANV_CMD_DIRTY_PIPELINE; + +#if GFX_VER >= 11 + /* The PIPE_CONTROL command description says: + * + * "Whenever a Binding Table Index (BTI) used by a Render Target Message + * points to a different RENDER_SURFACE_STATE, SW must issue a Render + * Target Cache Flush by enabling this bit. When render target flush + * is set due to new association of BTI, PS Scoreboard Stall bit must + * be set in this packet." 
+ */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | + ANV_PIPE_STALL_AT_SCOREBOARD_BIT, + "change RT"); +#endif + + cmd_buffer_emit_depth_stencil(cmd_buffer); + + cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview); +} + +static void +cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer, + struct anv_attachment *att, + VkImageAspectFlagBits aspect) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct anv_image_view *iview = att->iview; + + if (gfx->view_mask == 0) { + genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image, + aspect, att->aux_usage, + iview->planes[0].isl.base_level, + iview->planes[0].isl.base_array_layer, + gfx->layer_count); + } else { + uint32_t res_view_mask = gfx->view_mask; + while (res_view_mask) { + int i = u_bit_scan(&res_view_mask); + + const uint32_t level = iview->planes[0].isl.base_level; + const uint32_t layer = iview->planes[0].isl.base_array_layer + i; + + genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image, + aspect, att->aux_usage, + level, layer, 1); + } + } +} + +static enum blorp_filter +vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode) +{ + switch (vk_mode) { + case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT: + return BLORP_FILTER_SAMPLE_0; + case VK_RESOLVE_MODE_AVERAGE_BIT: + return BLORP_FILTER_AVERAGE; + case VK_RESOLVE_MODE_MIN_BIT: + return BLORP_FILTER_MIN_SAMPLE; + case VK_RESOLVE_MODE_MAX_BIT: + return BLORP_FILTER_MAX_SAMPLE; + default: + return BLORP_FILTER_NONE; + } +} + +static void +cmd_buffer_resolve_msaa_attachment(struct anv_cmd_buffer *cmd_buffer, + const struct anv_attachment *att, + VkImageLayout layout, + VkImageAspectFlagBits aspect) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + const struct anv_image_view *src_iview = att->iview; + const struct anv_image_view *dst_iview = att->resolve_iview; + + enum isl_aux_usage src_aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, + src_iview->image, aspect, + VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + layout); + + enum isl_aux_usage dst_aux_usage = + anv_layout_to_aux_usage(cmd_buffer->device->info, + dst_iview->image, aspect, + VK_IMAGE_USAGE_TRANSFER_DST_BIT, + att->resolve_layout); + + enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode); + + const VkRect2D render_area = gfx->render_area; + if (gfx->view_mask == 0) { + anv_image_msaa_resolve(cmd_buffer, + src_iview->image, src_aux_usage, + src_iview->planes[0].isl.base_level, + src_iview->planes[0].isl.base_array_layer, + dst_iview->image, dst_aux_usage, + dst_iview->planes[0].isl.base_level, + dst_iview->planes[0].isl.base_array_layer, + aspect, + render_area.offset.x, render_area.offset.y, + render_area.offset.x, render_area.offset.y, + render_area.extent.width, + render_area.extent.height, + gfx->layer_count, filter); + } else { + uint32_t res_view_mask = gfx->view_mask; + while (res_view_mask) { + int i = u_bit_scan(&res_view_mask); + + anv_image_msaa_resolve(cmd_buffer, + src_iview->image, src_aux_usage, + src_iview->planes[0].isl.base_level, + src_iview->planes[0].isl.base_array_layer + i, + dst_iview->image, dst_aux_usage, + dst_iview->planes[0].isl.base_level, + dst_iview->planes[0].isl.base_array_layer + i, + aspect, + render_area.offset.x, render_area.offset.y, + render_area.offset.x, render_area.offset.y, + render_area.extent.width, + render_area.extent.height, + 1, filter); + } + } +} + +void genX(CmdEndRendering)( + VkCommandBuffer commandBuffer) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, 
cmd_buffer, commandBuffer); + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + if (anv_batch_has_error(&cmd_buffer->batch)) + return; + + const bool is_multiview = gfx->view_mask != 0; + const uint32_t layers = + is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count; + + bool has_color_resolve = false; + for (uint32_t i = 0; i < gfx->color_att_count; i++) { + if (gfx->color_att[i].iview == NULL) + continue; + + cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i], + VK_IMAGE_ASPECT_COLOR_BIT); + + /* Stash this off for later */ + if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE && + !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) + has_color_resolve = true; + } + + if (gfx->depth_att.iview != NULL) { + cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att, + VK_IMAGE_ASPECT_DEPTH_BIT); + } + + if (gfx->stencil_att.iview != NULL) { + cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att, + VK_IMAGE_ASPECT_STENCIL_BIT); + } + + if (has_color_resolve) { + /* We are about to do some MSAA resolves. We need to flush so that the + * result of writes to the MSAA color attachments show up in the sampler + * when we blit to the single-sampled resolve target. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, + "MSAA resolve"); + } + + if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE || + gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) { + /* We are about to do some MSAA resolves. We need to flush so that the + * result of writes to the MSAA depth attachments show up in the sampler + * when we blit to the single-sampled resolve target. + */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | + ANV_PIPE_DEPTH_CACHE_FLUSH_BIT, + "MSAA resolve"); + } + + for (uint32_t i = 0; i < gfx->color_att_count; i++) { + const struct anv_attachment *att = &gfx->color_att[i]; + if (att->resolve_mode == VK_RESOLVE_MODE_NONE || + (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) + continue; + + cmd_buffer_resolve_msaa_attachment(cmd_buffer, att, att->layout, + VK_IMAGE_ASPECT_COLOR_BIT); + } + + if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE && + !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) { + const struct anv_image_view *src_iview = gfx->depth_att.iview; + + /* MSAA resolves sample from the source attachment. Transition the + * depth attachment first to get rid of any HiZ that we may not be + * able to handle. + */ + transition_depth_buffer(cmd_buffer, src_iview->image, + src_iview->planes[0].isl.base_array_layer, + layers, + gfx->depth_att.layout, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + false /* will_full_fast_clear */); + + cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->depth_att, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_IMAGE_ASPECT_DEPTH_BIT); + + /* Transition the source back to the original layout. This seems a bit + * inefficient but, since HiZ resolves aren't destructive, going from + * less HiZ to more is generally a no-op. 
+ */ + transition_depth_buffer(cmd_buffer, src_iview->image, + src_iview->planes[0].isl.base_array_layer, + layers, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + gfx->depth_att.layout, + false /* will_full_fast_clear */); + } + + if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE && + !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) { + cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->stencil_att, + gfx->stencil_att.layout, + VK_IMAGE_ASPECT_STENCIL_BIT); + } + +#if GFX_VER == 7 + /* On gfx7, we have to store a texturable version of the stencil buffer in + * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and + * forth at strategic points. Stencil writes are only allowed in following + * layouts: + * + * - VK_IMAGE_LAYOUT_GENERAL + * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL + * - VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT + * + * For general, we have no nice opportunity to transition so we do the copy + * to the shadow unconditionally at the end of the subpass. For transfer + * destinations, we can update it as part of the transfer op. For the other + * layouts, we delay the copy until a transition into some other layout. + */ + if (gfx->stencil_att.iview != NULL) { + const struct anv_image_view *iview = gfx->stencil_att.iview; + const struct anv_image *image = iview->image; + const uint32_t plane = + anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT); + + if (anv_surface_is_valid(&image->planes[plane].shadow_surface) && + (gfx->stencil_att.layout == VK_IMAGE_LAYOUT_GENERAL || + gfx->stencil_att.layout == VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT)) { + anv_image_copy_to_shadow(cmd_buffer, image, + VK_IMAGE_ASPECT_STENCIL_BIT, + iview->planes[plane].isl.base_level, 1, + iview->planes[plane].isl.base_array_layer, + layers); + } + } +#endif + + anv_cmd_buffer_reset_rendering(cmd_buffer); +} + +void +genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer) +{ +#if GFX_VERx10 >= 75 + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), + mi_reg32(ANV_PREDICATE_RESULT_REG)); + mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0)); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) { + mip.LoadOperation = LOAD_LOADINV; + mip.CombineOperation = COMBINE_SET; + mip.CompareOperation = COMPARE_SRCS_EQUAL; + } +#endif +} + +#if GFX_VERx10 >= 75 +void genX(CmdBeginConditionalRenderingEXT)( + VkCommandBuffer commandBuffer, + const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + struct anv_address value_address = + anv_address_add(buffer->address, pConditionalRenderingBegin->offset); + + const bool isInverted = pConditionalRenderingBegin->flags & + VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; + + cmd_state->conditional_render_enabled = true; + + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + /* Section 19.4 of the Vulkan 1.1.85 spec says: + * + * If the value of the predicate in buffer memory changes + * while conditional rendering is active, the rendering 
commands + * may be discarded in an implementation-dependent way. + * Some implementations may latch the value of the predicate + * upon beginning conditional rendering while others + * may read it before every rendering command. + * + * So it's perfectly fine to read a value from the buffer once. + */ + struct mi_value value = mi_mem32(value_address); + + /* Precompute predicate result, it is necessary to support secondary + * command buffers since it is unknown if conditional rendering is + * inverted when populating them. + */ + mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG), + isInverted ? mi_uge(&b, mi_imm(0), value) : + mi_ult(&b, mi_imm(0), value)); +} + +void genX(CmdEndConditionalRenderingEXT)( + VkCommandBuffer commandBuffer) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + struct anv_cmd_state *cmd_state = &cmd_buffer->state; + + cmd_state->conditional_render_enabled = false; +} +#endif + +/* Set of stage bits for which are pipelined, i.e. they get queued + * by the command streamer for later execution. + */ +#define ANV_PIPELINE_STAGE_PIPELINED_BITS \ + ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \ + VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \ + VK_PIPELINE_STAGE_2_HOST_BIT | \ + VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT) + +void genX(CmdSetEvent2)( + VkCommandBuffer commandBuffer, + VkEvent _event, + const VkDependencyInfo* pDependencyInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_event, event, _event); + + VkPipelineStageFlags2 src_stages = 0; + + for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) + src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask; + for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) + src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask; + for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) + src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask; + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) { + pc.StallAtPixelScoreboard = true; + pc.CommandStreamerStallEnable = true; + } + + pc.DestinationAddressType = DAT_PPGTT, + pc.PostSyncOperation = WriteImmediateData, + pc.Address = (struct anv_address) { + cmd_buffer->device->dynamic_state_pool.block_pool.bo, + event->state.offset + }; + pc.ImmediateData = VK_EVENT_SET; + anv_debug_dump_pc(pc); + } +} + +void genX(CmdResetEvent2)( + VkCommandBuffer commandBuffer, + VkEvent _event, + VkPipelineStageFlags2 stageMask) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_event, event, _event); + + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) { + pc.StallAtPixelScoreboard = true; + pc.CommandStreamerStallEnable = true; + } + + pc.DestinationAddressType = DAT_PPGTT; + pc.PostSyncOperation = WriteImmediateData; + pc.Address = (struct anv_address) { + cmd_buffer->device->dynamic_state_pool.block_pool.bo, + event->state.offset + }; + pc.ImmediateData = VK_EVENT_RESET; + anv_debug_dump_pc(pc); + } +} + +void genX(CmdWaitEvents2)( + VkCommandBuffer commandBuffer, + uint32_t eventCount, + const VkEvent* pEvents, + const VkDependencyInfo* pDependencyInfos) +{ 
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + +#if GFX_VER >= 8 + for (uint32_t i = 0; i < eventCount; i++) { + ANV_FROM_HANDLE(anv_event, event, pEvents[i]); + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) { + sem.WaitMode = PollingMode, + sem.CompareOperation = COMPARE_SAD_EQUAL_SDD, + sem.SemaphoreDataDword = VK_EVENT_SET, + sem.SemaphoreAddress = (struct anv_address) { + cmd_buffer->device->dynamic_state_pool.block_pool.bo, + event->state.offset + }; + } + } +#else + anv_finishme("Implement events on gfx7"); +#endif + + cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event"); +} + +static uint32_t vk_to_intel_index_type(VkIndexType type) +{ + switch (type) { + case VK_INDEX_TYPE_UINT8_EXT: + return INDEX_BYTE; + case VK_INDEX_TYPE_UINT16: + return INDEX_WORD; + case VK_INDEX_TYPE_UINT32: + return INDEX_DWORD; + default: + unreachable("invalid index type"); + } +} + +static uint32_t restart_index_for_type(VkIndexType type) +{ + switch (type) { + case VK_INDEX_TYPE_UINT8_EXT: + return UINT8_MAX; + case VK_INDEX_TYPE_UINT16: + return UINT16_MAX; + case VK_INDEX_TYPE_UINT32: + return UINT32_MAX; + default: + unreachable("invalid index type"); + } +} + +void genX(CmdBindIndexBuffer)( + VkCommandBuffer commandBuffer, + VkBuffer _buffer, + VkDeviceSize offset, + VkIndexType indexType) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_buffer, buffer, _buffer); + + cmd_buffer->state.gfx.restart_index = restart_index_for_type(indexType); + cmd_buffer->state.gfx.index_buffer = buffer; + cmd_buffer->state.gfx.index_type = vk_to_intel_index_type(indexType); + cmd_buffer->state.gfx.index_offset = offset; + + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER; +} + +VkResult genX(CmdSetPerformanceOverrideINTEL)( + VkCommandBuffer commandBuffer, + const VkPerformanceOverrideInfoINTEL* pOverrideInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + switch (pOverrideInfo->type) { + case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: { +#if GFX_VER >= 9 + anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) { + csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable; + csdm2.MediaInstructionDisable = pOverrideInfo->enable; + csdm2._3DRenderingInstructionDisableMask = true; + csdm2.MediaInstructionDisableMask = true; + } +#else + anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) { + instpm._3DRenderingInstructionDisable = pOverrideInfo->enable; + instpm.MediaInstructionDisable = pOverrideInfo->enable; + instpm._3DRenderingInstructionDisableMask = true; + instpm.MediaInstructionDisableMask = true; + } +#endif + break; + } + + case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL: + if (pOverrideInfo->enable) { + /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */ + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_FLUSH_BITS | + ANV_PIPE_INVALIDATE_BITS, + "perf counter isolation"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + break; + + default: + unreachable("Invalid override"); + } + + return VK_SUCCESS; +} + +VkResult genX(CmdSetPerformanceStreamMarkerINTEL)( + VkCommandBuffer commandBuffer, + const VkPerformanceStreamMarkerInfoINTEL* pMarkerInfo) +{ + /* TODO: Waiting on the register to write, might depend on generation. 
*/ + + return VK_SUCCESS; +} + +#define TIMESTAMP 0x2358 + +void genX(cmd_emit_timestamp)(struct anv_batch *batch, + struct anv_device *device, + struct anv_address addr, + bool end_of_pipe) { + if (end_of_pipe) { + anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { + pc.PostSyncOperation = WriteTimestamp; + pc.Address = addr; + anv_debug_dump_pc(pc); + } + } else { + struct mi_builder b; + mi_builder_init(&b, device->info, batch); + mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP)); + } +} diff --git a/src/intel/vulkan_hasvk/genX_gpu_memcpy.c b/src/intel/vulkan_hasvk/genX_gpu_memcpy.c new file mode 100644 index 00000000000..3468137b0a8 --- /dev/null +++ b/src/intel/vulkan_hasvk/genX_gpu_memcpy.c @@ -0,0 +1,324 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" + +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" + +#include "common/intel_l3_config.h" + +/** + * This file implements some lightweight memcpy/memset operations on the GPU + * using a vertex buffer and streamout. + */ + +/** + * Returns the greatest common divisor of a and b that is a power of two. + */ +static uint64_t +gcd_pow2_u64(uint64_t a, uint64_t b) +{ + assert(a > 0 || b > 0); + + unsigned a_log2 = ffsll(a) - 1; + unsigned b_log2 = ffsll(b) - 1; + + /* If either a or b is 0, then a_log2 or b_log2 will be UINT_MAX in which + * case, the MIN2() will take the other one. If both are 0 then we will + * hit the assert above. 
+ */ + return 1 << MIN2(a_log2, b_log2); +} + +static void +emit_common_so_memcpy(struct anv_batch *batch, struct anv_device *device, + const struct intel_l3_config *l3_config) +{ +#if GFX_VER >= 8 + anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.InstancingEnable = false; + vfi.VertexElementIndex = 0; + } + anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs); +#endif + + /* Disable all shader stages */ + anv_batch_emit(batch, GENX(3DSTATE_VS), vs); + anv_batch_emit(batch, GENX(3DSTATE_HS), hs); + anv_batch_emit(batch, GENX(3DSTATE_TE), te); + anv_batch_emit(batch, GENX(3DSTATE_DS), DS); + anv_batch_emit(batch, GENX(3DSTATE_GS), gs); + anv_batch_emit(batch, GENX(3DSTATE_PS), gs); + + anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) { + sbe.VertexURBEntryReadOffset = 1; + sbe.NumberofSFOutputAttributes = 1; + sbe.VertexURBEntryReadLength = 1; +#if GFX_VER >= 8 + sbe.ForceVertexURBEntryReadLength = true; + sbe.ForceVertexURBEntryReadOffset = true; +#endif + +#if GFX_VER >= 9 + for (unsigned i = 0; i < 32; i++) + sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; +#endif + } + + /* Emit URB setup. We tell it that the VS is active because we want it to + * allocate space for the VS. Even though one isn't run, we need VUEs to + * store the data that VF is going to pass to SOL. + */ + const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 }; + + genX(emit_urb_setup)(device, batch, l3_config, + VK_SHADER_STAGE_VERTEX_BIT, entry_size, NULL); + +#if GFX_VER >= 12 + /* Disable Primitive Replication. */ + anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); +#endif + +#if GFX_VER >= 8 + anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) { + topo.PrimitiveTopologyType = _3DPRIM_POINTLIST; + } +#endif + + anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) { + vf.StatisticsEnable = false; + } +} + +static void +emit_so_memcpy(struct anv_batch *batch, struct anv_device *device, + struct anv_address dst, struct anv_address src, + uint32_t size) +{ + /* The maximum copy block size is 4 32-bit components at a time. */ + assert(size % 4 == 0); + unsigned bs = gcd_pow2_u64(16, size); + + enum isl_format format; + switch (bs) { + case 4: format = ISL_FORMAT_R32_UINT; break; + case 8: format = ISL_FORMAT_R32G32_UINT; break; + case 16: format = ISL_FORMAT_R32G32B32A32_UINT; break; + default: + unreachable("Invalid size"); + } + + uint32_t *dw; + dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_VERTEX_BUFFERS)); + GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1, + &(struct GENX(VERTEX_BUFFER_STATE)) { + .VertexBufferIndex = 32, /* Reserved for this */ + .AddressModifyEnable = true, + .BufferStartingAddress = src, + .BufferPitch = bs, + .MOCS = anv_mocs(device, src.bo, 0), +#if GFX_VER >= 12 + .L3BypassDisable = true, +#endif +#if (GFX_VER >= 8) + .BufferSize = size, +#else + .EndAddress = anv_address_add(src, size - 1), +#endif + }); + + dw = anv_batch_emitn(batch, 3, GENX(3DSTATE_VERTEX_ELEMENTS)); + GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw + 1, + &(struct GENX(VERTEX_ELEMENT_STATE)) { + .VertexBufferIndex = 32, + .Valid = true, + .SourceElementFormat = format, + .SourceElementOffset = 0, + .Component0Control = (bs >= 4) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, + .Component1Control = (bs >= 8) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, + .Component2Control = (bs >= 12) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, + .Component3Control = (bs >= 16) ? 
VFCOMP_STORE_SRC : VFCOMP_STORE_0, + }); + + + anv_batch_emit(batch, GENX(3DSTATE_SO_BUFFER), sob) { +#if GFX_VER < 12 + sob.SOBufferIndex = 0; +#else + sob._3DCommandOpcode = 0; + sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD; +#endif + sob.MOCS = anv_mocs(device, dst.bo, 0), + sob.SurfaceBaseAddress = dst; + +#if GFX_VER >= 8 + sob.SOBufferEnable = true; + sob.SurfaceSize = size / 4 - 1; +#else + sob.SurfacePitch = bs; + sob.SurfaceEndAddress = anv_address_add(dst, size); +#endif + +#if GFX_VER >= 8 + /* As SOL writes out data, it updates the SO_WRITE_OFFSET registers with + * the end position of the stream. We need to reset this value to 0 at + * the beginning of the run or else SOL will start at the offset from + * the previous draw. + */ + sob.StreamOffsetWriteEnable = true; + sob.StreamOffset = 0; +#endif + } + +#if GFX_VER <= 7 + /* The hardware can do this for us on BDW+ (see above) */ + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), load) { + load.RegisterOffset = GENX(SO_WRITE_OFFSET0_num); + load.DataDWord = 0; + } +#endif + + dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_SO_DECL_LIST), + .StreamtoBufferSelects0 = (1 << 0), + .NumEntries0 = 1); + GENX(SO_DECL_ENTRY_pack)(batch, dw + 3, + &(struct GENX(SO_DECL_ENTRY)) { + .Stream0Decl = { + .OutputBufferSlot = 0, + .RegisterIndex = 0, + .ComponentMask = (1 << (bs / 4)) - 1, + }, + }); + + anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so) { + so.SOFunctionEnable = true; + so.RenderingDisable = true; + so.Stream0VertexReadOffset = 0; + so.Stream0VertexReadLength = DIV_ROUND_UP(32, 64); +#if GFX_VER >= 8 + so.Buffer0SurfacePitch = bs; +#else + so.SOBufferEnable0 = true; +#endif + } + + anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) { + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = _3DPRIM_POINTLIST; + prim.VertexCountPerInstance = size / bs; + prim.StartVertexLocation = 0; + prim.InstanceCount = 1; + prim.StartInstanceLocation = 0; + prim.BaseVertexLocation = 0; + } +} + +void +genX(emit_so_memcpy_init)(struct anv_memcpy_state *state, + struct anv_device *device, + struct anv_batch *batch) +{ + memset(state, 0, sizeof(*state)); + + state->batch = batch; + state->device = device; + + const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info); + genX(emit_l3_config)(batch, device, cfg); + + anv_batch_emit(batch, GENX(PIPELINE_SELECT), ps) { +#if GFX_VER >= 9 + ps.MaskBits = GFX_VER >= 12 ? 
0x13 : 3; + ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12; +#endif + ps.PipelineSelection = _3D; + } + + emit_common_so_memcpy(batch, device, cfg); +} + +void +genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state) +{ + genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D, + ANV_PIPE_END_OF_PIPE_SYNC_BIT); + + anv_batch_emit(state->batch, GENX(MI_BATCH_BUFFER_END), end); + + if ((state->batch->next - state->batch->start) & 4) + anv_batch_emit(state->batch, GENX(MI_NOOP), noop); +} + +void +genX(emit_so_memcpy)(struct anv_memcpy_state *state, + struct anv_address dst, struct anv_address src, + uint32_t size) +{ + if (GFX_VER >= 8 && GFX_VER <= 9 && + !anv_use_relocations(state->device->physical) && + anv_gfx8_9_vb_cache_range_needs_workaround(&state->vb_bound, + &state->vb_dirty, + src, size)) { + genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D, + ANV_PIPE_CS_STALL_BIT | + ANV_PIPE_VF_CACHE_INVALIDATE_BIT); + memset(&state->vb_dirty, 0, sizeof(state->vb_dirty)); + } + + emit_so_memcpy(state->batch, state->device, dst, src, size); +} + +void +genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, + struct anv_address dst, struct anv_address src, + uint32_t size) +{ + if (size == 0) + return; + + if (!cmd_buffer->state.current_l3_config) { + const struct intel_l3_config *cfg = + intel_get_default_l3_config(cmd_buffer->device->info); + genX(cmd_buffer_config_l3)(cmd_buffer, cfg); + } + + genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, 32, src, size); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + genX(flush_pipeline_select_3d)(cmd_buffer); + + emit_common_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, + cmd_buffer->state.current_l3_config); + emit_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, dst, src, size); + + genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, SEQUENTIAL, + 1ull << 32); + + /* Invalidate pipeline & raster discard since we touch + * 3DSTATE_STREAMOUT. + */ + cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; + BITSET_SET(cmd_buffer->vk.dynamic_graphics_state.dirty, + MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE); +} diff --git a/src/intel/vulkan_hasvk/genX_pipeline.c b/src/intel/vulkan_hasvk/genX_pipeline.c new file mode 100644 index 00000000000..a28f34a0efa --- /dev/null +++ b/src/intel/vulkan_hasvk/genX_pipeline.c @@ -0,0 +1,2563 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "anv_private.h" + +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" +#include "genxml/gen_rt_pack.h" + +#include "common/intel_l3_config.h" +#include "common/intel_sample_positions.h" +#include "nir/nir_xfb_info.h" +#include "vk_util.h" +#include "vk_format.h" +#include "vk_log.h" +#include "vk_render_pass.h" + +static uint32_t +vertex_element_comp_control(enum isl_format format, unsigned comp) +{ + uint8_t bits; + switch (comp) { + case 0: bits = isl_format_layouts[format].channels.r.bits; break; + case 1: bits = isl_format_layouts[format].channels.g.bits; break; + case 2: bits = isl_format_layouts[format].channels.b.bits; break; + case 3: bits = isl_format_layouts[format].channels.a.bits; break; + default: unreachable("Invalid component"); + } + + /* + * Take in account hardware restrictions when dealing with 64-bit floats. + * + * From Broadwell spec, command reference structures, page 586: + * "When SourceElementFormat is set to one of the *64*_PASSTHRU formats, + * 64-bit components are stored * in the URB without any conversion. In + * this case, vertex elements must be written as 128 or 256 bits, with + * VFCOMP_STORE_0 being used to pad the output as required. E.g., if + * R64_PASSTHRU is used to copy a 64-bit Red component into the URB, + * Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3 + * set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or + * Components 1-3 must be specified as VFCOMP_STORE_0 in order to output + * a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires + * Component 3 to be specified as VFCOMP_STORE_0 in order to output a + * 256-bit vertex element." + */ + if (bits) { + return VFCOMP_STORE_SRC; + } else if (comp >= 2 && + !isl_format_layouts[format].channels.b.bits && + isl_format_layouts[format].channels.r.type == ISL_RAW) { + /* When emitting 64-bit attributes, we need to write either 128 or 256 + * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and + * VFCOMP_STORE_0 to pad the written chunk */ + return VFCOMP_NOSTORE; + } else if (comp < 3 || + isl_format_layouts[format].channels.r.type == ISL_RAW) { + /* Note we need to pad with value 0, not 1, due hardware restrictions + * (see comment above) */ + return VFCOMP_STORE_0; + } else if (isl_format_layouts[format].channels.r.type == ISL_UINT || + isl_format_layouts[format].channels.r.type == ISL_SINT) { + assert(comp == 3); + return VFCOMP_STORE_1_INT; + } else { + assert(comp == 3); + return VFCOMP_STORE_1_FP; + } +} + +static void +emit_vertex_input(struct anv_graphics_pipeline *pipeline, + const struct vk_vertex_input_state *vi) +{ + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + + /* Pull inputs_read out of the VS prog data */ + const uint64_t inputs_read = vs_prog_data->inputs_read; + const uint64_t double_inputs_read = + vs_prog_data->double_inputs_read & inputs_read; + assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0); + const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0; + const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0; + const bool needs_svgs_elem = vs_prog_data->uses_vertexid || + vs_prog_data->uses_instanceid || + vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance; + + uint32_t elem_count = __builtin_popcount(elements) - + __builtin_popcount(elements_double) / 2; + + const uint32_t total_elems = + MAX2(1, elem_count + needs_svgs_elem + vs_prog_data->uses_drawid); + + uint32_t *p; + + const uint32_t 
num_dwords = 1 + total_elems * 2; + p = anv_batch_emitn(&pipeline->base.batch, num_dwords, + GENX(3DSTATE_VERTEX_ELEMENTS)); + if (!p) + return; + + for (uint32_t i = 0; i < total_elems; i++) { + /* The SKL docs for VERTEX_ELEMENT_STATE say: + * + * "All elements must be valid from Element[0] to the last valid + * element. (I.e. if Element[2] is valid then Element[1] and + * Element[0] must also be valid)." + * + * The SKL docs for 3D_Vertex_Component_Control say: + * + * "Don't store this component. (Not valid for Component 0, but can + * be used for Component 1-3)." + * + * So we can't just leave a vertex element blank and hope for the best. + * We have to tell the VF hardware to put something in it; so we just + * store a bunch of zero. + * + * TODO: Compact vertex elements so we never end up with holes. + */ + struct GENX(VERTEX_ELEMENT_STATE) element = { + .Valid = true, + .Component0Control = VFCOMP_STORE_0, + .Component1Control = VFCOMP_STORE_0, + .Component2Control = VFCOMP_STORE_0, + .Component3Control = VFCOMP_STORE_0, + }; + GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + i * 2], &element); + } + + u_foreach_bit(a, vi->attributes_valid) { + enum isl_format format = anv_get_isl_format(pipeline->base.device->info, + vi->attributes[a].format, + VK_IMAGE_ASPECT_COLOR_BIT, + VK_IMAGE_TILING_LINEAR); + + uint32_t binding = vi->attributes[a].binding; + assert(binding < MAX_VBS); + + if ((elements & (1 << a)) == 0) + continue; /* Binding unused */ + + uint32_t slot = + __builtin_popcount(elements & ((1 << a) - 1)) - + DIV_ROUND_UP(__builtin_popcount(elements_double & + ((1 << a) -1)), 2); + + struct GENX(VERTEX_ELEMENT_STATE) element = { + .VertexBufferIndex = vi->attributes[a].binding, + .Valid = true, + .SourceElementFormat = format, + .EdgeFlagEnable = false, + .SourceElementOffset = vi->attributes[a].offset, + .Component0Control = vertex_element_comp_control(format, 0), + .Component1Control = vertex_element_comp_control(format, 1), + .Component2Control = vertex_element_comp_control(format, 2), + .Component3Control = vertex_element_comp_control(format, 3), + }; + GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element); + +#if GFX_VER >= 8 + /* On Broadwell and later, we have a separate VF_INSTANCING packet + * that controls instancing. On Haswell and prior, that's part of + * VERTEX_BUFFER_STATE which we emit later. + */ + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + bool per_instance = pipeline->vb[binding].instanced; + uint32_t divisor = pipeline->vb[binding].instance_divisor * + pipeline->instance_multiplier; + + vfi.InstancingEnable = per_instance; + vfi.VertexElementIndex = slot; + vfi.InstanceDataStepRate = per_instance ? divisor : 1; + } +#endif + } + + const uint32_t id_slot = elem_count; + if (needs_svgs_elem) { + /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum: + * "Within a VERTEX_ELEMENT_STATE structure, if a Component + * Control field is set to something other than VFCOMP_STORE_SRC, + * no higher-numbered Component Control fields may be set to + * VFCOMP_STORE_SRC" + * + * This means, that if we have BaseInstance, we need BaseVertex as + * well. Just do all or nothing. + */ + uint32_t base_ctrl = (vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance) ? 
+ VFCOMP_STORE_SRC : VFCOMP_STORE_0; + + struct GENX(VERTEX_ELEMENT_STATE) element = { + .VertexBufferIndex = ANV_SVGS_VB_INDEX, + .Valid = true, + .SourceElementFormat = ISL_FORMAT_R32G32_UINT, + .Component0Control = base_ctrl, + .Component1Control = base_ctrl, +#if GFX_VER >= 8 + .Component2Control = VFCOMP_STORE_0, + .Component3Control = VFCOMP_STORE_0, +#else + .Component2Control = VFCOMP_STORE_VID, + .Component3Control = VFCOMP_STORE_IID, +#endif + }; + GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + id_slot * 2], &element); + +#if GFX_VER >= 8 + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.VertexElementIndex = id_slot; + } +#endif + } + +#if GFX_VER >= 8 + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_SGVS), sgvs) { + sgvs.VertexIDEnable = vs_prog_data->uses_vertexid; + sgvs.VertexIDComponentNumber = 2; + sgvs.VertexIDElementOffset = id_slot; + sgvs.InstanceIDEnable = vs_prog_data->uses_instanceid; + sgvs.InstanceIDComponentNumber = 3; + sgvs.InstanceIDElementOffset = id_slot; + } +#endif + + const uint32_t drawid_slot = elem_count + needs_svgs_elem; + if (vs_prog_data->uses_drawid) { + struct GENX(VERTEX_ELEMENT_STATE) element = { + .VertexBufferIndex = ANV_DRAWID_VB_INDEX, + .Valid = true, + .SourceElementFormat = ISL_FORMAT_R32_UINT, + .Component0Control = VFCOMP_STORE_SRC, + .Component1Control = VFCOMP_STORE_0, + .Component2Control = VFCOMP_STORE_0, + .Component3Control = VFCOMP_STORE_0, + }; + GENX(VERTEX_ELEMENT_STATE_pack)(NULL, + &p[1 + drawid_slot * 2], + &element); + +#if GFX_VER >= 8 + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.VertexElementIndex = drawid_slot; + } +#endif + } +} + +void +genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, + const struct intel_l3_config *l3_config, + VkShaderStageFlags active_stages, + const unsigned entry_size[4], + enum intel_urb_deref_block_size *deref_block_size) +{ + const struct intel_device_info *devinfo = device->info; + + unsigned entries[4]; + unsigned start[4]; + bool constrained; + intel_get_urb_config(devinfo, l3_config, + active_stages & + VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, + active_stages & VK_SHADER_STAGE_GEOMETRY_BIT, + entry_size, entries, start, deref_block_size, + &constrained); + +#if GFX_VERx10 == 70 + /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: + * + * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall + * needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS, + * 3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS, + * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one PIPE_CONTROL + * needs to be sent before any combination of VS associated 3DSTATE." 
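+    *
+    * The PIPE_CONTROL below implements exactly that: a depth stall
+    * combined with a post-sync immediate-data write to the device's
+    * workaround address.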
+ */ + anv_batch_emit(batch, GFX7_PIPE_CONTROL, pc) { + pc.DepthStallEnable = true; + pc.PostSyncOperation = WriteImmediateData; + pc.Address = device->workaround_address; + } +#endif + + for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { + anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + urb.VSURBStartingAddress = start[i]; + urb.VSURBEntryAllocationSize = entry_size[i] - 1; + urb.VSNumberofURBEntries = entries[i]; + } + } +#if GFX_VERx10 >= 125 + if (device->physical->vk.supported_extensions.NV_mesh_shader) { + anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero); + anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero); + } +#endif +} + +#if GFX_VERx10 >= 125 +static void +emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline, + enum intel_urb_deref_block_size *deref_block_size) +{ + const struct intel_device_info *devinfo = pipeline->base.device->info; + + const struct brw_task_prog_data *task_prog_data = + anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK) ? + get_task_prog_data(pipeline) : NULL; + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + + const struct intel_mesh_urb_allocation alloc = + intel_get_mesh_urb_config(devinfo, pipeline->base.l3_config, + task_prog_data ? task_prog_data->map.size_dw : 0, + mesh_prog_data->map.size_dw); + + /* Zero out the primitive pipeline URB allocations. */ + for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + } + } + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_URB_ALLOC_TASK), urb) { + if (task_prog_data) { + urb.TASKURBEntryAllocationSize = alloc.task_entry_size_64b - 1; + urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries; + urb.TASKNumberofURBEntriesSliceN = alloc.task_entries; + urb.TASKURBStartingAddressSlice0 = alloc.task_starting_address_8kb; + urb.TASKURBStartingAddressSliceN = alloc.task_starting_address_8kb; + } + } + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_URB_ALLOC_MESH), urb) { + urb.MESHURBEntryAllocationSize = alloc.mesh_entry_size_64b - 1; + urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries; + urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries; + urb.MESHURBStartingAddressSlice0 = alloc.mesh_starting_address_8kb; + urb.MESHURBStartingAddressSliceN = alloc.mesh_starting_address_8kb; + } + + *deref_block_size = alloc.deref_block_size; +} +#endif + +static void +emit_urb_setup(struct anv_graphics_pipeline *pipeline, + enum intel_urb_deref_block_size *deref_block_size) +{ +#if GFX_VERx10 >= 125 + if (anv_pipeline_is_mesh(pipeline)) { + emit_urb_setup_mesh(pipeline, deref_block_size); + return; + } +#endif + + unsigned entry_size[4]; + for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { + const struct brw_vue_prog_data *prog_data = + !anv_pipeline_has_stage(pipeline, i) ? NULL : + (const struct brw_vue_prog_data *) pipeline->shaders[i]->prog_data; + + entry_size[i] = prog_data ? 
prog_data->urb_entry_size : 1; + } + + genX(emit_urb_setup)(pipeline->base.device, &pipeline->base.batch, + pipeline->base.l3_config, + pipeline->active_stages, entry_size, + deref_block_size); +} + +static void +emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline) +{ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE), sbe); +#if GFX_VER >= 8 + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ), sbe); +#endif +#if GFX_VERx10 >= 125 + if (anv_pipeline_is_mesh(pipeline)) + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_MESH), sbe_mesh); +#endif + return; + } + + struct GENX(3DSTATE_SBE) sbe = { + GENX(3DSTATE_SBE_header), + /* TODO(mesh): Figure out cases where we need attribute swizzling. See also + * calculate_urb_setup() and related functions. + */ + .AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline), + .PointSpriteTextureCoordinateOrigin = UPPERLEFT, + .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs, + .ConstantInterpolationEnable = wm_prog_data->flat_inputs, + }; + +#if GFX_VER >= 9 + for (unsigned i = 0; i < 32; i++) + sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; +#endif + +#if GFX_VER >= 8 + /* On Broadwell, they broke 3DSTATE_SBE into two packets */ + struct GENX(3DSTATE_SBE_SWIZ) swiz = { + GENX(3DSTATE_SBE_SWIZ_header), + }; +#else +# define swiz sbe +#endif + + if (anv_pipeline_is_primitive(pipeline)) { + const struct brw_vue_map *fs_input_map = + &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map; + + int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs, + fs_input_map); + assert(first_slot % 2 == 0); + unsigned urb_entry_read_offset = first_slot / 2; + int max_source_attr = 0; + for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) { + uint8_t attr = wm_prog_data->urb_setup_attribs[idx]; + int input_index = wm_prog_data->urb_setup[attr]; + + assert(0 <= input_index); + + /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the + * VUE header + */ + if (attr == VARYING_SLOT_VIEWPORT || + attr == VARYING_SLOT_LAYER || + attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) { + continue; + } + + if (attr == VARYING_SLOT_PNTC) { + sbe.PointSpriteTextureCoordinateEnable = 1 << input_index; + continue; + } + + const int slot = fs_input_map->varying_to_slot[attr]; + + if (slot == -1) { + /* This attribute does not exist in the VUE--that means that the + * vertex shader did not write to it. It could be that it's a + * regular varying read by the fragment shader but not written by + * the vertex shader or it's gl_PrimitiveID. In the first case the + * value is undefined, in the second it needs to be + * gl_PrimitiveID. + */ + swiz.Attribute[input_index].ConstantSource = PRIM_ID; + swiz.Attribute[input_index].ComponentOverrideX = true; + swiz.Attribute[input_index].ComponentOverrideY = true; + swiz.Attribute[input_index].ComponentOverrideZ = true; + swiz.Attribute[input_index].ComponentOverrideW = true; + continue; + } + + /* We have to subtract two slots to account for the URB entry output + * read offset in the VS and GS stages. 
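+          *
+          * For example, with urb_entry_read_offset = 1 (attribute reads
+          * start two slots into the VUE), a varying living in VUE slot 2
+          * ends up as source attribute 2 - 2 * 1 = 0.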
+ */ + const int source_attr = slot - 2 * urb_entry_read_offset; + assert(source_attr >= 0 && source_attr < 32); + max_source_attr = MAX2(max_source_attr, source_attr); + /* The hardware can only do overrides on 16 overrides at a time, and the + * other up to 16 have to be lined up so that the input index = the + * output index. We'll need to do some tweaking to make sure that's the + * case. + */ + if (input_index < 16) + swiz.Attribute[input_index].SourceAttribute = source_attr; + else + assert(source_attr == input_index); + } + + sbe.VertexURBEntryReadOffset = urb_entry_read_offset; + sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2); +#if GFX_VER >= 8 + sbe.ForceVertexURBEntryReadOffset = true; + sbe.ForceVertexURBEntryReadLength = true; +#endif + } else { + assert(anv_pipeline_is_mesh(pipeline)); +#if GFX_VERx10 >= 125 + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_MESH), sbe_mesh) { + const struct brw_mue_map *mue = &mesh_prog_data->map; + + assert(mue->per_vertex_header_size_dw % 8 == 0); + sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8; + sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8); + + /* Clip distance array is passed in the per-vertex header so that + * it can be consumed by the HW. If user wants to read it in the FS, + * adjust the offset and length to cover it. Conveniently it is at + * the end of the per-vertex header, right before per-vertex + * attributes. + * + * Note that FS attribute reading must be aware that the clip + * distances have fixed position. + */ + if (mue->per_vertex_header_size_dw > 8 && + (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 || + wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) { + sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1; + sbe_mesh.PerVertexURBEntryOutputReadLength += 1; + } + + assert(mue->per_primitive_header_size_dw % 8 == 0); + sbe_mesh.PerPrimitiveURBEntryOutputReadOffset = mue->per_primitive_header_size_dw / 8; + sbe_mesh.PerPrimitiveURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8); + + /* Just like with clip distances, if Primitive Shading Rate, + * Viewport Index or Layer is read back in the FS, adjust + * the offset and length to cover the Primitive Header, where + * PSR, Viewport Index & Layer are stored. + */ + if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 || + wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 || + wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0) { + assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0); + sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1; + sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1; + } + } +#endif + } + + uint32_t *dw = anv_batch_emit_dwords(&pipeline->base.batch, + GENX(3DSTATE_SBE_length)); + if (!dw) + return; + GENX(3DSTATE_SBE_pack)(&pipeline->base.batch, dw, &sbe); + +#if GFX_VER >= 8 + dw = anv_batch_emit_dwords(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ_length)); + if (!dw) + return; + GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->base.batch, dw, &swiz); +#endif +} + +/** Returns the final polygon mode for rasterization + * + * This function takes into account polygon mode, primitive topology and the + * different shader stages which might generate their own type of primitives. 
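+ *
+ * For example, a pipeline whose geometry shader emits a line strip
+ * always rasterizes as VK_POLYGON_MODE_LINE, regardless of the polygon
+ * mode requested through the API.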
+ */ +VkPolygonMode +genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline, + VkPrimitiveTopology primitive_topology) +{ + if (anv_pipeline_is_mesh(pipeline)) { + switch (get_mesh_prog_data(pipeline)->primitive_type) { + case SHADER_PRIM_POINTS: + return VK_POLYGON_MODE_POINT; + case SHADER_PRIM_LINES: + return VK_POLYGON_MODE_LINE; + case SHADER_PRIM_TRIANGLES: + return pipeline->polygon_mode; + default: + unreachable("invalid primitive type for mesh"); + } + } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) { + switch (get_gs_prog_data(pipeline)->output_topology) { + case _3DPRIM_POINTLIST: + return VK_POLYGON_MODE_POINT; + + case _3DPRIM_LINELIST: + case _3DPRIM_LINESTRIP: + case _3DPRIM_LINELOOP: + return VK_POLYGON_MODE_LINE; + + case _3DPRIM_TRILIST: + case _3DPRIM_TRIFAN: + case _3DPRIM_TRISTRIP: + case _3DPRIM_RECTLIST: + case _3DPRIM_QUADLIST: + case _3DPRIM_QUADSTRIP: + case _3DPRIM_POLYGON: + return pipeline->polygon_mode; + } + unreachable("Unsupported GS output topology"); + } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) { + switch (get_tes_prog_data(pipeline)->output_topology) { + case BRW_TESS_OUTPUT_TOPOLOGY_POINT: + return VK_POLYGON_MODE_POINT; + + case BRW_TESS_OUTPUT_TOPOLOGY_LINE: + return VK_POLYGON_MODE_LINE; + + case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW: + case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW: + return pipeline->polygon_mode; + } + unreachable("Unsupported TCS output topology"); + } else { + switch (primitive_topology) { + case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: + return VK_POLYGON_MODE_POINT; + + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: + return VK_POLYGON_MODE_LINE; + + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: + return pipeline->polygon_mode; + + default: + unreachable("Unsupported primitive topology"); + } + } +} + +uint32_t +genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline, + VkPolygonMode raster_mode) +{ +#if GFX_VER <= 7 + if (raster_mode == VK_POLYGON_MODE_LINE) { + switch (pipeline->line_mode) { + case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT: + return MSRASTMODE_ON_PATTERN; + + case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT: + case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT: + return MSRASTMODE_OFF_PIXEL; + + default: + unreachable("Unsupported line rasterization mode"); + } + } else { + return pipeline->rasterization_samples > 1 ? 
+ MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL; + } +#else + unreachable("Only on gen7"); +#endif +} + +const uint32_t genX(vk_to_intel_cullmode)[] = { + [VK_CULL_MODE_NONE] = CULLMODE_NONE, + [VK_CULL_MODE_FRONT_BIT] = CULLMODE_FRONT, + [VK_CULL_MODE_BACK_BIT] = CULLMODE_BACK, + [VK_CULL_MODE_FRONT_AND_BACK] = CULLMODE_BOTH +}; + +const uint32_t genX(vk_to_intel_fillmode)[] = { + [VK_POLYGON_MODE_FILL] = FILL_MODE_SOLID, + [VK_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME, + [VK_POLYGON_MODE_POINT] = FILL_MODE_POINT, +}; + +const uint32_t genX(vk_to_intel_front_face)[] = { + [VK_FRONT_FACE_COUNTER_CLOCKWISE] = 1, + [VK_FRONT_FACE_CLOCKWISE] = 0 +}; + +void +genX(rasterization_mode)(VkPolygonMode raster_mode, + VkLineRasterizationModeEXT line_mode, + float line_width, + uint32_t *api_mode, + bool *msaa_rasterization_enable) +{ +#if GFX_VER >= 8 + if (raster_mode == VK_POLYGON_MODE_LINE) { + /* Unfortunately, configuring our line rasterization hardware on gfx8 + * and later is rather painful. Instead of giving us bits to tell the + * hardware what line mode to use like we had on gfx7, we now have an + * arcane combination of API Mode and MSAA enable bits which do things + * in a table which are expected to magically put the hardware into the + * right mode for your API. Sadly, Vulkan isn't any of the APIs the + * hardware people thought of so nothing works the way you want it to. + * + * Look at the table titled "Multisample Rasterization Modes" in Vol 7 + * of the Skylake PRM for more details. + */ + switch (line_mode) { + case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT: + *api_mode = DX100; +#if GFX_VER <= 9 + /* Prior to ICL, the algorithm the HW uses to draw wide lines + * doesn't quite match what the CTS expects, at least for rectangular + * lines, so we set this to false here, making it draw parallelograms + * instead, which work well enough. 
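+          *
+          * Note that the 1.0078125 cutoff used below is exactly 1 + 1/128:
+          * only lines narrower than that keep MSAA rasterization enabled.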
+ */ + *msaa_rasterization_enable = line_width < 1.0078125; +#else + *msaa_rasterization_enable = true; +#endif + break; + + case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT: + case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT: + *api_mode = DX9OGL; + *msaa_rasterization_enable = false; + break; + + default: + unreachable("Unsupported line rasterization mode"); + } + } else { + *api_mode = DX100; + *msaa_rasterization_enable = true; + } +#else + unreachable("Invalid call"); +#endif +} + +static void +emit_rs_state(struct anv_graphics_pipeline *pipeline, + const struct vk_input_assembly_state *ia, + const struct vk_rasterization_state *rs, + const struct vk_multisample_state *ms, + const struct vk_render_pass_state *rp, + enum intel_urb_deref_block_size urb_deref_block_size) +{ + struct GENX(3DSTATE_SF) sf = { + GENX(3DSTATE_SF_header), + }; + + sf.ViewportTransformEnable = true; + sf.StatisticsEnable = true; + sf.VertexSubPixelPrecisionSelect = _8Bit; + sf.AALineDistanceMode = true; + + switch (rs->provoking_vertex) { + case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: + sf.TriangleStripListProvokingVertexSelect = 0; + sf.LineStripListProvokingVertexSelect = 0; + sf.TriangleFanProvokingVertexSelect = 1; + break; + + case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: + sf.TriangleStripListProvokingVertexSelect = 2; + sf.LineStripListProvokingVertexSelect = 1; + sf.TriangleFanProvokingVertexSelect = 2; + break; + + default: + unreachable("Invalid provoking vertex mode"); + } + +#if GFX_VERx10 == 75 + sf.LineStippleEnable = rs->line.stipple.enable; +#endif + +#if GFX_VER >= 12 + sf.DerefBlockSize = urb_deref_block_size; +#endif + + bool point_from_shader; + if (anv_pipeline_is_primitive(pipeline)) { + const struct brw_vue_prog_data *last_vue_prog_data = + anv_pipeline_get_last_vue_prog_data(pipeline); + point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ; + } else { + assert(anv_pipeline_is_mesh(pipeline)); + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0; + } + + if (point_from_shader) { + sf.PointWidthSource = Vertex; + } else { + sf.PointWidthSource = State; + sf.PointWidth = 1.0; + } + +#if GFX_VER >= 8 + struct GENX(3DSTATE_RASTER) raster = { + GENX(3DSTATE_RASTER_header), + }; +#else +# define raster sf +#endif + + /* For details on 3DSTATE_RASTER multisample state, see the BSpec table + * "Multisample Modes State". + */ +#if GFX_VER >= 8 + /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix + * computations. If we ever set this bit to a different value, they will + * need to be updated accordingly. 
+ */ + raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0; + raster.ForceMultisampling = false; +#endif + + raster.FrontFaceFillMode = genX(vk_to_intel_fillmode)[rs->polygon_mode]; + raster.BackFaceFillMode = genX(vk_to_intel_fillmode)[rs->polygon_mode]; + raster.ScissorRectangleEnable = true; + +#if GFX_VER >= 9 + /* GFX9+ splits ViewportZClipTestEnable into near and far enable bits */ + raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable; + raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable; +#elif GFX_VER >= 8 + raster.ViewportZClipTestEnable = pipeline->depth_clip_enable; +#endif + +#if GFX_VER >= 9 + raster.ConservativeRasterizationEnable = + rs->conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT; +#endif + +#if GFX_VER == 7 + /* Gfx7 requires that we provide the depth format in 3DSTATE_SF so that it + * can get the depth offsets correct. + */ + if (rp != NULL && + rp->depth_attachment_format != VK_FORMAT_UNDEFINED) { + assert(vk_format_has_depth(rp->depth_attachment_format)); + enum isl_format isl_format = + anv_get_isl_format(pipeline->base.device->info, + rp->depth_attachment_format, + VK_IMAGE_ASPECT_DEPTH_BIT, + VK_IMAGE_TILING_OPTIMAL); + sf.DepthBufferSurfaceFormat = + isl_format_get_depth_format(isl_format, false); + } +#endif + +#if GFX_VER >= 8 + GENX(3DSTATE_SF_pack)(NULL, pipeline->gfx8.sf, &sf); + GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gfx8.raster, &raster); +#else +# undef raster + GENX(3DSTATE_SF_pack)(NULL, &pipeline->gfx7.sf, &sf); +#endif +} + +static void +emit_ms_state(struct anv_graphics_pipeline *pipeline, + const struct vk_multisample_state *ms) +{ +#if GFX_VER >= 8 + /* On Gfx8+ 3DSTATE_MULTISAMPLE only holds the number of samples. */ + genX(emit_multisample)(&pipeline->base.batch, + pipeline->rasterization_samples, + NULL); +#endif + + /* From the Vulkan 1.0 spec: + * If pSampleMask is NULL, it is treated as if the mask has all bits + * enabled, i.e. no coverage is removed from fragments. + * + * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits. 
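+    *
+    * Hence the all-ones defaults below: 0xffff on gfx8+, and 0xff on
+    * gfx7, which appears to only expose an 8-bit mask field.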
+ */ +#if GFX_VER >= 8 + uint32_t sample_mask = 0xffff; +#else + uint32_t sample_mask = 0xff; +#endif + + if (ms != NULL) + sample_mask &= ms->sample_mask; + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SAMPLE_MASK), sm) { + sm.SampleMask = sample_mask; + } +} + +const uint32_t genX(vk_to_intel_logic_op)[] = { + [VK_LOGIC_OP_COPY] = LOGICOP_COPY, + [VK_LOGIC_OP_CLEAR] = LOGICOP_CLEAR, + [VK_LOGIC_OP_AND] = LOGICOP_AND, + [VK_LOGIC_OP_AND_REVERSE] = LOGICOP_AND_REVERSE, + [VK_LOGIC_OP_AND_INVERTED] = LOGICOP_AND_INVERTED, + [VK_LOGIC_OP_NO_OP] = LOGICOP_NOOP, + [VK_LOGIC_OP_XOR] = LOGICOP_XOR, + [VK_LOGIC_OP_OR] = LOGICOP_OR, + [VK_LOGIC_OP_NOR] = LOGICOP_NOR, + [VK_LOGIC_OP_EQUIVALENT] = LOGICOP_EQUIV, + [VK_LOGIC_OP_INVERT] = LOGICOP_INVERT, + [VK_LOGIC_OP_OR_REVERSE] = LOGICOP_OR_REVERSE, + [VK_LOGIC_OP_COPY_INVERTED] = LOGICOP_COPY_INVERTED, + [VK_LOGIC_OP_OR_INVERTED] = LOGICOP_OR_INVERTED, + [VK_LOGIC_OP_NAND] = LOGICOP_NAND, + [VK_LOGIC_OP_SET] = LOGICOP_SET, +}; + +static const uint32_t vk_to_intel_blend[] = { + [VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO, + [VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE, + [VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR, + [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR, + [VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR, + [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR, + [VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA, + [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA, + [VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA, + [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA, + [VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR, + [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR, + [VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA, + [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA, + [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE, + [VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR, + [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR, + [VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA, + [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA, +}; + +static const uint32_t vk_to_intel_blend_op[] = { + [VK_BLEND_OP_ADD] = BLENDFUNCTION_ADD, + [VK_BLEND_OP_SUBTRACT] = BLENDFUNCTION_SUBTRACT, + [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT, + [VK_BLEND_OP_MIN] = BLENDFUNCTION_MIN, + [VK_BLEND_OP_MAX] = BLENDFUNCTION_MAX, +}; + +const uint32_t genX(vk_to_intel_compare_op)[] = { + [VK_COMPARE_OP_NEVER] = PREFILTEROP_NEVER, + [VK_COMPARE_OP_LESS] = PREFILTEROP_LESS, + [VK_COMPARE_OP_EQUAL] = PREFILTEROP_EQUAL, + [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROP_LEQUAL, + [VK_COMPARE_OP_GREATER] = PREFILTEROP_GREATER, + [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROP_NOTEQUAL, + [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GEQUAL, + [VK_COMPARE_OP_ALWAYS] = PREFILTEROP_ALWAYS, +}; + +const uint32_t genX(vk_to_intel_stencil_op)[] = { + [VK_STENCIL_OP_KEEP] = STENCILOP_KEEP, + [VK_STENCIL_OP_ZERO] = STENCILOP_ZERO, + [VK_STENCIL_OP_REPLACE] = STENCILOP_REPLACE, + [VK_STENCIL_OP_INCREMENT_AND_CLAMP] = STENCILOP_INCRSAT, + [VK_STENCIL_OP_DECREMENT_AND_CLAMP] = STENCILOP_DECRSAT, + [VK_STENCIL_OP_INVERT] = STENCILOP_INVERT, + [VK_STENCIL_OP_INCREMENT_AND_WRAP] = STENCILOP_INCR, + [VK_STENCIL_OP_DECREMENT_AND_WRAP] = STENCILOP_DECR, +}; + +const uint32_t genX(vk_to_intel_primitive_type)[] = { + [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = _3DPRIM_POINTLIST, + 
[VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = _3DPRIM_LINELIST, + [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = _3DPRIM_LINESTRIP, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = _3DPRIM_TRILIST, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = _3DPRIM_TRIFAN, + [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = _3DPRIM_LINELIST_ADJ, + [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = _3DPRIM_TRILIST_ADJ, + [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ, +}; + +static bool +is_dual_src_blend_factor(VkBlendFactor factor) +{ + return factor == VK_BLEND_FACTOR_SRC1_COLOR || + factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR || + factor == VK_BLEND_FACTOR_SRC1_ALPHA || + factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA; +} + +static inline uint32_t * +write_disabled_blend(uint32_t *state) +{ + struct GENX(BLEND_STATE_ENTRY) entry = { + .WriteDisableAlpha = true, + .WriteDisableRed = true, + .WriteDisableGreen = true, + .WriteDisableBlue = true, + }; + GENX(BLEND_STATE_ENTRY_pack)(NULL, state, &entry); + return state + GENX(BLEND_STATE_ENTRY_length); +} + +static void +emit_cb_state(struct anv_graphics_pipeline *pipeline, + const struct vk_color_blend_state *cb, + const struct vk_multisample_state *ms) +{ + struct anv_device *device = pipeline->base.device; + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + + struct GENX(BLEND_STATE) blend_state = { +#if GFX_VER >= 8 + .AlphaToCoverageEnable = ms && ms->alpha_to_coverage_enable, + .AlphaToOneEnable = ms && ms->alpha_to_one_enable, +#endif + }; + + uint32_t surface_count = 0; + struct anv_pipeline_bind_map *map; + if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map; + surface_count = map->surface_count; + } + + const struct intel_device_info *devinfo = pipeline->base.device->info; + uint32_t *blend_state_start = devinfo->ver >= 8 ? + pipeline->gfx8.blend_state : pipeline->gfx7.blend_state; + uint32_t *state_pos = blend_state_start; + + state_pos += GENX(BLEND_STATE_length); +#if GFX_VER >= 8 + struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 }; +#endif + for (unsigned i = 0; i < surface_count; i++) { + struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i]; + + /* All color attachments are at the beginning of the binding table */ + if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS) + break; + + /* We can have at most 8 attachments */ + assert(i < MAX_RTS); + + if (cb == NULL || binding->index >= cb->attachment_count) { + state_pos = write_disabled_blend(state_pos); + continue; + } + + const struct vk_color_blend_attachment_state *a = + &cb->attachments[binding->index]; + + struct GENX(BLEND_STATE_ENTRY) entry = { +#if GFX_VER < 8 + .AlphaToCoverageEnable = ms && ms->alpha_to_coverage_enable, + .AlphaToOneEnable = ms && ms->alpha_to_one_enable, +#endif + .LogicOpEnable = cb->logic_op_enable, + + /* Vulkan specification 1.2.168, VkLogicOp: + * + * "Logical operations are controlled by the logicOpEnable and + * logicOp members of VkPipelineColorBlendStateCreateInfo. If + * logicOpEnable is VK_TRUE, then a logical operation selected by + * logicOp is applied between each color attachment and the + * fragment’s corresponding output value, and blending of all + * attachments is treated as if it were disabled." 
+ * + * From the Broadwell PRM Volume 2d: Command Reference: Structures: + * BLEND_STATE_ENTRY: + * + * "Enabling LogicOp and Color Buffer Blending at the same time is + * UNDEFINED" + */ + .ColorBufferBlendEnable = !cb->logic_op_enable && a->blend_enable, + .ColorClampRange = COLORCLAMP_RTFORMAT, + .PreBlendColorClampEnable = true, + .PostBlendColorClampEnable = true, + .SourceBlendFactor = vk_to_intel_blend[a->src_color_blend_factor], + .DestinationBlendFactor = vk_to_intel_blend[a->dst_color_blend_factor], + .ColorBlendFunction = vk_to_intel_blend_op[a->color_blend_op], + .SourceAlphaBlendFactor = vk_to_intel_blend[a->src_alpha_blend_factor], + .DestinationAlphaBlendFactor = vk_to_intel_blend[a->dst_alpha_blend_factor], + .AlphaBlendFunction = vk_to_intel_blend_op[a->alpha_blend_op], + }; + + if (a->src_color_blend_factor != a->src_alpha_blend_factor || + a->dst_color_blend_factor != a->dst_alpha_blend_factor || + a->color_blend_op != a->alpha_blend_op) { +#if GFX_VER >= 8 + blend_state.IndependentAlphaBlendEnable = true; +#else + entry.IndependentAlphaBlendEnable = true; +#endif + } + + /* The Dual Source Blending documentation says: + * + * "If SRC1 is included in a src/dst blend factor and + * a DualSource RT Write message is not used, results + * are UNDEFINED. (This reflects the same restriction in DX APIs, + * where undefined results are produced if “o1” is not written + * by a PS – there are no default values defined)." + * + * There is no way to gracefully fix this undefined situation + * so we just disable the blending to prevent possible issues. + */ + if (!wm_prog_data->dual_src_blend && + (is_dual_src_blend_factor(a->src_color_blend_factor) || + is_dual_src_blend_factor(a->dst_color_blend_factor) || + is_dual_src_blend_factor(a->src_alpha_blend_factor) || + is_dual_src_blend_factor(a->dst_alpha_blend_factor))) { + vk_logw(VK_LOG_OBJS(&device->vk.base), + "Enabled dual-src blend factors without writing both targets " + "in the shader. Disabling blending to avoid GPU hangs."); + entry.ColorBufferBlendEnable = false; + } + + /* Our hardware applies the blend factor prior to the blend function + * regardless of what function is used. Technically, this means the + * hardware can do MORE than GL or Vulkan specify. However, it also + * means that, for MIN and MAX, we have to stomp the blend factor to + * ONE to make it a no-op. 
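+       *
+       * Concretely: Vulkan defines MIN/MAX as min(Cs, Cd) / max(Cs, Cd)
+       * with the blend factors ignored, while the hardware computes
+       * min(Sf * Cs, Df * Cd); forcing both factors to ONE below makes
+       * the two agree.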
+ */ + if (a->color_blend_op == VK_BLEND_OP_MIN || + a->color_blend_op == VK_BLEND_OP_MAX) { + entry.SourceBlendFactor = BLENDFACTOR_ONE; + entry.DestinationBlendFactor = BLENDFACTOR_ONE; + } + if (a->alpha_blend_op == VK_BLEND_OP_MIN || + a->alpha_blend_op == VK_BLEND_OP_MAX) { + entry.SourceAlphaBlendFactor = BLENDFACTOR_ONE; + entry.DestinationAlphaBlendFactor = BLENDFACTOR_ONE; + } + GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry); + state_pos += GENX(BLEND_STATE_ENTRY_length); +#if GFX_VER >= 8 + if (i == 0) + bs0 = entry; +#endif + } + +#if GFX_VER >= 8 + struct GENX(3DSTATE_PS_BLEND) blend = { + GENX(3DSTATE_PS_BLEND_header), + }; + blend.AlphaToCoverageEnable = blend_state.AlphaToCoverageEnable; + blend.ColorBufferBlendEnable = bs0.ColorBufferBlendEnable; + blend.SourceAlphaBlendFactor = bs0.SourceAlphaBlendFactor; + blend.DestinationAlphaBlendFactor = bs0.DestinationAlphaBlendFactor; + blend.SourceBlendFactor = bs0.SourceBlendFactor; + blend.DestinationBlendFactor = bs0.DestinationBlendFactor; + blend.AlphaTestEnable = false; + blend.IndependentAlphaBlendEnable = blend_state.IndependentAlphaBlendEnable; + + GENX(3DSTATE_PS_BLEND_pack)(NULL, pipeline->gfx8.ps_blend, &blend); +#endif + + GENX(BLEND_STATE_pack)(NULL, blend_state_start, &blend_state); +} + +static void +emit_3dstate_clip(struct anv_graphics_pipeline *pipeline, + const struct vk_input_assembly_state *ia, + const struct vk_viewport_state *vp, + const struct vk_rasterization_state *rs) +{ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + (void) wm_prog_data; + + struct GENX(3DSTATE_CLIP) clip = { + GENX(3DSTATE_CLIP_header), + }; + + clip.ClipEnable = true; + clip.StatisticsEnable = true; + clip.EarlyCullEnable = true; + clip.APIMode = pipeline->negative_one_to_one ? APIMODE_OGL : APIMODE_D3D; + clip.GuardbandClipTestEnable = true; + +#if GFX_VER >= 8 + clip.VertexSubPixelPrecisionSelect = _8Bit; +#endif + clip.ClipMode = CLIPMODE_NORMAL; + + switch (rs->provoking_vertex) { + case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: + clip.TriangleStripListProvokingVertexSelect = 0; + clip.LineStripListProvokingVertexSelect = 0; + clip.TriangleFanProvokingVertexSelect = 1; + break; + + case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: + clip.TriangleStripListProvokingVertexSelect = 2; + clip.LineStripListProvokingVertexSelect = 1; + clip.TriangleFanProvokingVertexSelect = 2; + break; + + default: + unreachable("Invalid provoking vertex mode"); + } + + clip.MinimumPointWidth = 0.125; + clip.MaximumPointWidth = 255.875; + + /* TODO(mesh): Multiview. */ + if (anv_pipeline_is_primitive(pipeline)) { + const struct brw_vue_prog_data *last = + anv_pipeline_get_last_vue_prog_data(pipeline); + + /* From the Vulkan 1.0.45 spec: + * + * "If the last active vertex processing stage shader entry point's + * interface does not include a variable decorated with + * ViewportIndex, then the first viewport is used." + */ + if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) { + clip.MaximumVPIndex = vp->viewport_count > 0 ? + vp->viewport_count - 1 : 0; + } else { + clip.MaximumVPIndex = 0; + } + + /* From the Vulkan 1.0.45 spec: + * + * "If the last active vertex processing stage shader entry point's + * interface does not include a variable decorated with Layer, then + * the first layer is used." 
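+       *
+       * ForceZeroRTAIndexEnable below implements that: when the last
+       * pre-rasterization stage does not write Layer, the render target
+       * array index is forced to zero.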
+ */ + clip.ForceZeroRTAIndexEnable = + !(last->vue_map.slots_valid & VARYING_BIT_LAYER); + +#if GFX_VER == 7 + clip.UserClipDistanceClipTestEnableBitmask = last->clip_distance_mask; + clip.UserClipDistanceCullTestEnableBitmask = last->cull_distance_mask; +#endif + } else if (anv_pipeline_is_mesh(pipeline)) { + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + if (vp && vp->viewport_count > 0 && + mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) { + clip.MaximumVPIndex = vp->viewport_count - 1; + } + } + +#if GFX_VER == 7 + clip.FrontWinding = genX(vk_to_intel_front_face)[rs->front_face]; + clip.CullMode = genX(vk_to_intel_cullmode)[rs->cull_mode]; + clip.ViewportZClipTestEnable = pipeline->depth_clip_enable; +#else + clip.NonPerspectiveBarycentricEnable = wm_prog_data ? + wm_prog_data->uses_nonperspective_interp_modes : 0; +#endif + + GENX(3DSTATE_CLIP_pack)(NULL, pipeline->gfx7.clip, &clip); + +#if GFX_VERx10 >= 125 + if (anv_pipeline_is_mesh(pipeline)) { + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_CLIP_MESH), clip_mesh) { + clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0; + clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask; + clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask; + } + } +#endif +} + +static void +emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline, + const struct vk_rasterization_state *rs) +{ + const struct brw_vue_prog_data *prog_data = + anv_pipeline_get_last_vue_prog_data(pipeline); + const struct brw_vue_map *vue_map = &prog_data->vue_map; + + nir_xfb_info *xfb_info; + if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) + xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info; + else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) + xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info; + else + xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info; + + if (xfb_info) { + struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128]; + int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0}; + int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0}; + + memset(so_decl, 0, sizeof(so_decl)); + + for (unsigned i = 0; i < xfb_info->output_count; i++) { + const nir_xfb_output_info *output = &xfb_info->outputs[i]; + unsigned buffer = output->buffer; + unsigned stream = xfb_info->buffer_to_stream[buffer]; + + /* Our hardware is unusual in that it requires us to program SO_DECLs + * for fake "hole" components, rather than simply taking the offset + * for each real varying. Each hole can have size 1, 2, 3, or 4; we + * program as many size = 4 holes as we can, then a final hole to + * accommodate the final 1, 2, or 3 remaining. 
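+          *
+          * For example, a 24-byte gap between the previous output and this
+          * one is 6 dwords, which the loop below emits as a 4-dword hole
+          * (ComponentMask 0xf) followed by a 2-dword hole (ComponentMask 0x3).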
+ */ + int hole_dwords = (output->offset - next_offset[buffer]) / 4; + while (hole_dwords > 0) { + so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { + .HoleFlag = 1, + .OutputBufferSlot = buffer, + .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1, + }; + hole_dwords -= 4; + } + + int varying = output->location; + uint8_t component_mask = output->component_mask; + /* VARYING_SLOT_PSIZ contains four scalar fields packed together: + * - VARYING_SLOT_PRIMITIVE_SHADING_RATE in VARYING_SLOT_PSIZ.x + * - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y + * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z + * - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w + */ + if (varying == VARYING_SLOT_PRIMITIVE_SHADING_RATE) { + varying = VARYING_SLOT_PSIZ; + component_mask = 1 << 0; // SO_DECL_COMPMASK_X + } else if (varying == VARYING_SLOT_LAYER) { + varying = VARYING_SLOT_PSIZ; + component_mask = 1 << 1; // SO_DECL_COMPMASK_Y + } else if (varying == VARYING_SLOT_VIEWPORT) { + varying = VARYING_SLOT_PSIZ; + component_mask = 1 << 2; // SO_DECL_COMPMASK_Z + } else if (varying == VARYING_SLOT_PSIZ) { + component_mask = 1 << 3; // SO_DECL_COMPMASK_W + } + + next_offset[buffer] = output->offset + + __builtin_popcount(component_mask) * 4; + + const int slot = vue_map->varying_to_slot[varying]; + if (slot < 0) { + /* This can happen if the shader never writes to the varying. + * Insert a hole instead of actual varying data. + */ + so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { + .HoleFlag = true, + .OutputBufferSlot = buffer, + .ComponentMask = component_mask, + }; + } else { + so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { + .OutputBufferSlot = buffer, + .RegisterIndex = slot, + .ComponentMask = component_mask, + }; + } + } + + int max_decls = 0; + for (unsigned s = 0; s < MAX_XFB_STREAMS; s++) + max_decls = MAX2(max_decls, decls[s]); + + uint8_t sbs[MAX_XFB_STREAMS] = { }; + for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) { + if (xfb_info->buffers_written & (1 << b)) + sbs[xfb_info->buffer_to_stream[b]] |= 1 << b; + } + + /* Wa_16011773973: + * If SOL is enabled and SO_DECL state has to be programmed, + * 1. Send 3D State SOL state with SOL disabled + * 2. Send SO_DECL NP state + * 3. 
Send 3D State SOL with SOL Enabled + */ + if (intel_device_info_is_dg2(pipeline->base.device->info)) + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), so); + + uint32_t *dw = anv_batch_emitn(&pipeline->base.batch, 3 + 2 * max_decls, + GENX(3DSTATE_SO_DECL_LIST), + .StreamtoBufferSelects0 = sbs[0], + .StreamtoBufferSelects1 = sbs[1], + .StreamtoBufferSelects2 = sbs[2], + .StreamtoBufferSelects3 = sbs[3], + .NumEntries0 = decls[0], + .NumEntries1 = decls[1], + .NumEntries2 = decls[2], + .NumEntries3 = decls[3]); + + for (int i = 0; i < max_decls; i++) { + GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2, + &(struct GENX(SO_DECL_ENTRY)) { + .Stream0Decl = so_decl[0][i], + .Stream1Decl = so_decl[1][i], + .Stream2Decl = so_decl[2][i], + .Stream3Decl = so_decl[3][i], + }); + } + } + +#if GFX_VER == 7 +# define streamout_state_dw pipeline->gfx7.streamout_state +#else +# define streamout_state_dw pipeline->gfx8.streamout_state +#endif + + struct GENX(3DSTATE_STREAMOUT) so = { + GENX(3DSTATE_STREAMOUT_header), + }; + + if (xfb_info) { + so.SOFunctionEnable = true; + so.SOStatisticsEnable = true; + + switch (rs->provoking_vertex) { + case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: + so.ReorderMode = LEADING; + break; + + case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: + so.ReorderMode = TRAILING; + break; + + default: + unreachable("Invalid provoking vertex mode"); + } + + so.RenderStreamSelect = rs->rasterization_stream; + +#if GFX_VER >= 8 + so.Buffer0SurfacePitch = xfb_info->buffers[0].stride; + so.Buffer1SurfacePitch = xfb_info->buffers[1].stride; + so.Buffer2SurfacePitch = xfb_info->buffers[2].stride; + so.Buffer3SurfacePitch = xfb_info->buffers[3].stride; +#else + pipeline->gfx7.xfb_bo_pitch[0] = xfb_info->buffers[0].stride; + pipeline->gfx7.xfb_bo_pitch[1] = xfb_info->buffers[1].stride; + pipeline->gfx7.xfb_bo_pitch[2] = xfb_info->buffers[2].stride; + pipeline->gfx7.xfb_bo_pitch[3] = xfb_info->buffers[3].stride; + + /* On Gfx7, the SO buffer enables live in 3DSTATE_STREAMOUT which + * is a bit inconvenient because we don't know what buffers will + * actually be enabled until draw time. We do our best here by + * setting them based on buffers_written and we disable them + * as-needed at draw time by setting EndAddress = BaseAddress. + */ + so.SOBufferEnable0 = xfb_info->buffers_written & (1 << 0); + so.SOBufferEnable1 = xfb_info->buffers_written & (1 << 1); + so.SOBufferEnable2 = xfb_info->buffers_written & (1 << 2); + so.SOBufferEnable3 = xfb_info->buffers_written & (1 << 3); +#endif + + int urb_entry_read_offset = 0; + int urb_entry_read_length = + (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset; + + /* We always read the whole vertex. This could be reduced at some + * point by reading less and offsetting the register index in the + * SO_DECLs. 
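+       *
+       * For example, a VUE map with 9 slots yields urb_entry_read_length =
+       * (9 + 1) / 2 = 5, covering every written slot even if only a few of
+       * them are actually streamed out.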
+ */ + so.Stream0VertexReadOffset = urb_entry_read_offset; + so.Stream0VertexReadLength = urb_entry_read_length - 1; + so.Stream1VertexReadOffset = urb_entry_read_offset; + so.Stream1VertexReadLength = urb_entry_read_length - 1; + so.Stream2VertexReadOffset = urb_entry_read_offset; + so.Stream2VertexReadLength = urb_entry_read_length - 1; + so.Stream3VertexReadOffset = urb_entry_read_offset; + so.Stream3VertexReadLength = urb_entry_read_length - 1; + } + + GENX(3DSTATE_STREAMOUT_pack)(NULL, streamout_state_dw, &so); +} + +static uint32_t +get_sampler_count(const struct anv_shader_bin *bin) +{ + uint32_t count_by_4 = DIV_ROUND_UP(bin->bind_map.sampler_count, 4); + + /* We can potentially have way more than 32 samplers and that's ok. + * However, the 3DSTATE_XS packets only have 3 bits to specify how + * many to pre-fetch and all values above 4 are marked reserved. + */ + return MIN2(count_by_4, 4); +} + +static UNUSED struct anv_address +get_scratch_address(struct anv_pipeline *pipeline, + gl_shader_stage stage, + const struct anv_shader_bin *bin) +{ + return (struct anv_address) { + .bo = anv_scratch_pool_alloc(pipeline->device, + &pipeline->device->scratch_pool, + stage, bin->prog_data->total_scratch), + .offset = 0, + }; +} + +static UNUSED uint32_t +get_scratch_space(const struct anv_shader_bin *bin) +{ + return ffs(bin->prog_data->total_scratch / 2048); +} + +static UNUSED uint32_t +get_scratch_surf(struct anv_pipeline *pipeline, + gl_shader_stage stage, + const struct anv_shader_bin *bin) +{ + if (bin->prog_data->total_scratch == 0) + return 0; + + struct anv_bo *bo = + anv_scratch_pool_alloc(pipeline->device, + &pipeline->device->scratch_pool, + stage, bin->prog_data->total_scratch); + anv_reloc_list_add_bo(pipeline->batch.relocs, + pipeline->batch.alloc, bo); + return anv_scratch_pool_get_surf(pipeline->device, + &pipeline->device->scratch_pool, + bin->prog_data->total_scratch) >> 4; +} + +static void +emit_3dstate_vs(struct anv_graphics_pipeline *pipeline) +{ + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); + const struct anv_shader_bin *vs_bin = + pipeline->shaders[MESA_SHADER_VERTEX]; + + assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX)); + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VS), vs) { + vs.Enable = true; + vs.StatisticsEnable = true; + vs.KernelStartPointer = vs_bin->kernel.offset; +#if GFX_VER >= 8 + vs.SIMD8DispatchEnable = + vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8; +#endif + + assert(!vs_prog_data->base.base.use_alt_mode); +#if GFX_VER < 11 + vs.SingleVertexDispatch = false; +#endif + vs.VectorMaskEnable = false; + /* Wa_1606682166: + * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes. + * Disable the Sampler state prefetch functionality in the SARB by + * programming 0xB000[30] to '1'. + */ + vs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(vs_bin); + vs.BindingTableEntryCount = vs_bin->bind_map.surface_count; + vs.FloatingPointMode = IEEE754; + vs.IllegalOpcodeExceptionEnable = false; + vs.SoftwareExceptionEnable = false; + vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1; + + if (GFX_VER == 9 && devinfo->gt == 4 && + anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) { + /* On Sky Lake GT4, we have experienced some hangs related to the VS + * cache and tessellation. 
It is unknown exactly what is happening + * but the Haswell docs for the "VS Reference Count Full Force Miss + * Enable" field of the "Thread Mode" register refer to a HSW bug in + * which the VUE handle reference count would overflow resulting in + * internal reference counting bugs. My (Jason's) best guess is that + * this bug cropped back up on SKL GT4 when we suddenly had more + * threads in play than any previous gfx9 hardware. + * + * What we do know for sure is that setting this bit when + * tessellation shaders are in use fixes a GPU hang in Batman: Arkham + * City when playing with DXVK (https://bugs.freedesktop.org/107280). + * Disabling the vertex cache with tessellation shaders should only + * have a minor performance impact as the tessellation shaders are + * likely generating and processing far more geometry than the vertex + * stage. + */ + vs.VertexCacheDisable = true; + } + + vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length; + vs.VertexURBEntryReadOffset = 0; + vs.DispatchGRFStartRegisterForURBData = + vs_prog_data->base.base.dispatch_grf_start_reg; + +#if GFX_VER >= 8 + vs.UserClipDistanceClipTestEnableBitmask = + vs_prog_data->base.clip_distance_mask; + vs.UserClipDistanceCullTestEnableBitmask = + vs_prog_data->base.cull_distance_mask; +#endif + +#if GFX_VERx10 >= 125 + vs.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_VERTEX, vs_bin); +#else + vs.PerThreadScratchSpace = get_scratch_space(vs_bin); + vs.ScratchSpaceBasePointer = + get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin); +#endif + } +} + +static void +emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline, + const struct vk_tessellation_state *ts) +{ + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs); + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te); + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds); + return; + } + + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct anv_shader_bin *tcs_bin = + pipeline->shaders[MESA_SHADER_TESS_CTRL]; + const struct anv_shader_bin *tes_bin = + pipeline->shaders[MESA_SHADER_TESS_EVAL]; + + const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline); + const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline); + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs) { + hs.Enable = true; + hs.StatisticsEnable = true; + hs.KernelStartPointer = tcs_bin->kernel.offset; + /* Wa_1606682166 */ + hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin); + hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count; + +#if GFX_VER >= 12 + /* Wa_1604578095: + * + * Hang occurs when the number of max threads is less than 2 times + * the number of instance count. The number of max threads must be + * more than 2 times the number of instance count. 
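+       *
+       * The assert below encodes that requirement as a strict inequality,
+       *
+       *    tcs_prog_data->instances < devinfo->max_tcs_threads / 2
+       *
+       * so, to take a made-up figure, a part reporting max_tcs_threads = 224
+       * could run at most 111 HS instances.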
+ */ + assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances); +#endif + + hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1; + hs.IncludeVertexHandles = true; + hs.InstanceCount = tcs_prog_data->instances - 1; + + hs.VertexURBEntryReadLength = 0; + hs.VertexURBEntryReadOffset = 0; + hs.DispatchGRFStartRegisterForURBData = + tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f; +#if GFX_VER >= 12 + hs.DispatchGRFStartRegisterForURBData5 = + tcs_prog_data->base.base.dispatch_grf_start_reg >> 5; +#endif + +#if GFX_VERx10 >= 125 + hs.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin); +#else + hs.PerThreadScratchSpace = get_scratch_space(tcs_bin); + hs.ScratchSpaceBasePointer = + get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin); +#endif + +#if GFX_VER == 12 + /* Patch Count threshold specifies the maximum number of patches that + * will be accumulated before a thread dispatch is forced. + */ + hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold; +#endif + +#if GFX_VER >= 9 + hs.DispatchMode = tcs_prog_data->base.dispatch_mode; + hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id; +#endif + } + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te) { + te.Partitioning = tes_prog_data->partitioning; + + if (ts->domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) { + te.OutputTopology = tes_prog_data->output_topology; + } else { + /* When the origin is upper-left, we have to flip the winding order */ + if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) { + te.OutputTopology = OUTPUT_TRI_CW; + } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) { + te.OutputTopology = OUTPUT_TRI_CCW; + } else { + te.OutputTopology = tes_prog_data->output_topology; + } + } + + te.TEDomain = tes_prog_data->domain; + te.TEEnable = true; + te.MaximumTessellationFactorOdd = 63.0; + te.MaximumTessellationFactorNotOdd = 64.0; +#if GFX_VERx10 >= 125 + te.TessellationDistributionMode = TEDMODE_RR_FREE; + te.TessellationDistributionLevel = TEDLEVEL_PATCH; + /* 64_TRIANGLES */ + te.SmallPatchThreshold = 3; + /* 1K_TRIANGLES */ + te.TargetBlockSize = 8; + /* 1K_TRIANGLES */ + te.LocalBOPAccumulatorThreshold = 1; +#endif + } + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds) { + ds.Enable = true; + ds.StatisticsEnable = true; + ds.KernelStartPointer = tes_bin->kernel.offset; + /* Wa_1606682166 */ + ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin); + ds.BindingTableEntryCount = tes_bin->bind_map.surface_count; + ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1; + + ds.ComputeWCoordinateEnable = + tes_prog_data->domain == BRW_TESS_DOMAIN_TRI; + + ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length; + ds.PatchURBEntryReadOffset = 0; + ds.DispatchGRFStartRegisterForURBData = + tes_prog_data->base.base.dispatch_grf_start_reg; + +#if GFX_VER >= 8 +#if GFX_VER < 11 + ds.DispatchMode = + tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ? 
+ DISPATCH_MODE_SIMD8_SINGLE_PATCH : + DISPATCH_MODE_SIMD4X2; +#else + assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8); + ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH; +#endif + + ds.UserClipDistanceClipTestEnableBitmask = + tes_prog_data->base.clip_distance_mask; + ds.UserClipDistanceCullTestEnableBitmask = + tes_prog_data->base.cull_distance_mask; +#endif + +#if GFX_VER >= 12 + ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id; +#endif +#if GFX_VERx10 >= 125 + ds.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin); +#else + ds.PerThreadScratchSpace = get_scratch_space(tes_bin); + ds.ScratchSpaceBasePointer = + get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin); +#endif + } +} + +static void +emit_3dstate_gs(struct anv_graphics_pipeline *pipeline) +{ + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct anv_shader_bin *gs_bin = + pipeline->shaders[MESA_SHADER_GEOMETRY]; + + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs); + return; + } + + const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline); + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs) { + gs.Enable = true; + gs.StatisticsEnable = true; + gs.KernelStartPointer = gs_bin->kernel.offset; + gs.DispatchMode = gs_prog_data->base.dispatch_mode; + + gs.SingleProgramFlow = false; + gs.VectorMaskEnable = false; + /* Wa_1606682166 */ + gs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(gs_bin); + gs.BindingTableEntryCount = gs_bin->bind_map.surface_count; + gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles; + gs.IncludePrimitiveID = gs_prog_data->include_primitive_id; + + if (GFX_VER == 8) { + /* Broadwell is weird. It needs us to divide by 2. */ + gs.MaximumNumberofThreads = devinfo->max_gs_threads / 2 - 1; + } else { + gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1; + } + + gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1; + gs.OutputTopology = gs_prog_data->output_topology; + gs.ControlDataFormat = gs_prog_data->control_data_format; + gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords; + gs.InstanceControl = MAX2(gs_prog_data->invocations, 1) - 1; + gs.ReorderMode = TRAILING; + +#if GFX_VER >= 8 + gs.ExpectedVertexCount = gs_prog_data->vertices_in; + gs.StaticOutput = gs_prog_data->static_vertex_count >= 0; + gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ? 
+ gs_prog_data->static_vertex_count : 0; +#endif + + gs.VertexURBEntryReadOffset = 0; + gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length; + gs.DispatchGRFStartRegisterForURBData = + gs_prog_data->base.base.dispatch_grf_start_reg; + +#if GFX_VER >= 8 + gs.UserClipDistanceClipTestEnableBitmask = + gs_prog_data->base.clip_distance_mask; + gs.UserClipDistanceCullTestEnableBitmask = + gs_prog_data->base.cull_distance_mask; +#endif + +#if GFX_VERx10 >= 125 + gs.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin); +#else + gs.PerThreadScratchSpace = get_scratch_space(gs_bin); + gs.ScratchSpaceBasePointer = + get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin); +#endif + } +} + +static void +emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, + const struct vk_input_assembly_state *ia, + const struct vk_rasterization_state *rs, + const struct vk_multisample_state *ms, + const struct vk_color_blend_state *cb, + const struct vk_render_pass_state *rp) +{ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + + struct GENX(3DSTATE_WM) wm = { + GENX(3DSTATE_WM_header), + }; + wm.StatisticsEnable = true; + wm.LineEndCapAntialiasingRegionWidth = _05pixels; + wm.LineAntialiasingRegionWidth = _10pixels; + wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT; + + if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + if (wm_prog_data->early_fragment_tests) { + wm.EarlyDepthStencilControl = EDSC_PREPS; + } else if (wm_prog_data->has_side_effects) { + wm.EarlyDepthStencilControl = EDSC_PSEXEC; + } else { + wm.EarlyDepthStencilControl = EDSC_NORMAL; + } + +#if GFX_VER >= 8 + /* Gen8 hardware tries to compute ThreadDispatchEnable for us but + * doesn't take into account KillPixels when no depth or stencil + * writes are enabled. In order for occlusion queries to work + * correctly with no attachments, we need to force-enable PS thread + * dispatch. + * + * The BDW docs are pretty clear that that this bit isn't validated + * and probably shouldn't be used in production: + * + * "This must always be set to Normal. This field should not be + * tested for functional validation." + * + * Unfortunately, however, the other mechanism we have for doing this + * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW. + * Given two bad options, we choose the one which works. + */ + pipeline->force_fragment_thread_dispatch = + wm_prog_data->has_side_effects || + wm_prog_data->uses_kill; +#endif + + wm.BarycentricInterpolationMode = + wm_prog_data->barycentric_interp_modes; + +#if GFX_VER < 8 + wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode; + wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth; + wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w; + wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask; + + /* If the subpass has a depth or stencil self-dependency, then we + * need to force the hardware to do the depth/stencil write *after* + * fragment shader execution. Otherwise, the writes may hit memory + * before we get around to fetching from the input attachment and we + * may get the depth or stencil value from the current draw rather + * than the previous one. 
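+       *
+       * "Self-dependency" here means the feedback-loop case where the same
+       * image is bound both as the depth/stencil attachment and as an input
+       * attachment of the subpass; setting PixelShaderKillsPixel below is
+       * the mechanism used to force that late write.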
+ */ + wm.PixelShaderKillsPixel = rp->depth_self_dependency || + rp->stencil_self_dependency || + wm_prog_data->uses_kill; + + pipeline->force_fragment_thread_dispatch = + wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF || + wm_prog_data->has_side_effects || + wm.PixelShaderKillsPixel; + + if (ms != NULL && ms->rasterization_samples > 1) { + if (wm_prog_data->persample_dispatch) { + wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; + } else { + wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL; + } + } else { + wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; + } +#endif + + wm.LineStippleEnable = rs->line.stipple.enable; + } + + const struct intel_device_info *devinfo = pipeline->base.device->info; + uint32_t *dws = devinfo->ver >= 8 ? pipeline->gfx8.wm : pipeline->gfx7.wm; + GENX(3DSTATE_WM_pack)(NULL, dws, &wm); +} + +static void +emit_3dstate_ps(struct anv_graphics_pipeline *pipeline, + const struct vk_multisample_state *ms, + const struct vk_color_blend_state *cb) +{ + UNUSED const struct intel_device_info *devinfo = + pipeline->base.device->info; + const struct anv_shader_bin *fs_bin = + pipeline->shaders[MESA_SHADER_FRAGMENT]; + + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) { +#if GFX_VER == 7 + /* Even if no fragments are ever dispatched, gfx7 hardware hangs if + * we don't at least set the maximum number of threads. + */ + ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1; +#endif + } + return; + } + + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + +#if GFX_VER < 8 + /* The hardware wedges if you have this bit set but don't turn on any dual + * source blend factors. + */ + bool dual_src_blend = false; + if (wm_prog_data->dual_src_blend && cb) { + for (uint32_t i = 0; i < cb->attachment_count; i++) { + const struct vk_color_blend_attachment_state *a = + &cb->attachments[i]; + + if (a->blend_enable && + (is_dual_src_blend_factor(a->src_color_blend_factor) || + is_dual_src_blend_factor(a->dst_color_blend_factor) || + is_dual_src_blend_factor(a->src_alpha_blend_factor) || + is_dual_src_blend_factor(a->dst_alpha_blend_factor))) { + dual_src_blend = true; + break; + } + } + } +#endif + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) { + ps._8PixelDispatchEnable = wm_prog_data->dispatch_8; + ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; + ps._32PixelDispatchEnable = wm_prog_data->dispatch_32; + + /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable: + * + * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32 + * Dispatch must not be enabled for PER_PIXEL dispatch mode." + * + * Since 16x MSAA is first introduced on SKL, we don't need to apply + * the workaround on any older hardware. + */ + if (GFX_VER >= 9 && !wm_prog_data->persample_dispatch && + ms != NULL && ms->rasterization_samples == 16) { + assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable); + ps._32PixelDispatchEnable = false; + } + + ps.KernelStartPointer0 = fs_bin->kernel.offset + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0); + ps.KernelStartPointer1 = fs_bin->kernel.offset + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1); + ps.KernelStartPointer2 = fs_bin->kernel.offset + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2); + + ps.SingleProgramFlow = false; + ps.VectorMaskEnable = GFX_VER >= 8 && + wm_prog_data->uses_vmask; + /* Wa_1606682166 */ + ps.SamplerCount = GFX_VER == 11 ? 
0 : get_sampler_count(fs_bin); + ps.BindingTableEntryCount = fs_bin->bind_map.surface_count; + ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 || + wm_prog_data->base.ubo_ranges[0].length; + ps.PositionXYOffsetSelect = wm_prog_data->uses_pos_offset ? + POSOFFSET_SAMPLE: POSOFFSET_NONE; +#if GFX_VER < 8 + ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0; + ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; + ps.DualSourceBlendEnable = dual_src_blend; +#endif + +#if GFX_VERx10 == 75 + /* Haswell requires the sample mask to be set in this packet as well + * as in 3DSTATE_SAMPLE_MASK; the values should match. + */ + ps.SampleMask = 0xff; +#endif + +#if GFX_VER >= 8 + ps.MaximumNumberofThreadsPerPSD = + devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1); +#else + ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1; +#endif + + ps.DispatchGRFStartRegisterForConstantSetupData0 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0); + ps.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1); + ps.DispatchGRFStartRegisterForConstantSetupData2 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2); + +#if GFX_VERx10 >= 125 + ps.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin); +#else + ps.PerThreadScratchSpace = get_scratch_space(fs_bin); + ps.ScratchSpaceBasePointer = + get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin); +#endif + } +} + +#if GFX_VER >= 8 +static void +emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline, + const struct vk_rasterization_state *rs, + const struct vk_render_pass_state *rp) +{ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps); + return; + } + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps) { + ps.PixelShaderValid = true; + ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0; + ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; + ps.PixelShaderIsPerSample = wm_prog_data->persample_dispatch; + ps.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode; + ps.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth; + ps.PixelShaderUsesSourceW = wm_prog_data->uses_src_w; + + /* If the subpass has a depth or stencil self-dependency, then we need + * to force the hardware to do the depth/stencil write *after* fragment + * shader execution. Otherwise, the writes may hit memory before we get + * around to fetching from the input attachment and we may get the depth + * or stencil value from the current draw rather than the previous one. 
+ */ + ps.PixelShaderKillsPixel = rp->depth_self_dependency || + rp->stencil_self_dependency || + wm_prog_data->uses_kill; + +#if GFX_VER >= 9 + ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil; + ps.PixelShaderPullsBary = wm_prog_data->pulls_bary; + + ps.InputCoverageMaskState = ICMS_NONE; + assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */ + if (!wm_prog_data->uses_sample_mask) + ps.InputCoverageMaskState = ICMS_NONE; + else if (wm_prog_data->per_coarse_pixel_dispatch) + ps.InputCoverageMaskState = ICMS_NORMAL; + else if (wm_prog_data->post_depth_coverage) + ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE; + else + ps.InputCoverageMaskState = ICMS_NORMAL; +#else + ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask; +#endif + +#if GFX_VER >= 11 + ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients = + wm_prog_data->uses_depth_w_coefficients; + ps.PixelShaderIsPerCoarsePixel = wm_prog_data->per_coarse_pixel_dispatch; +#endif +#if GFX_VERx10 >= 125 + /* TODO: We should only require this when the last geometry shader uses + * a fragment shading rate that is not constant. + */ + ps.EnablePSDependencyOnCPsizeChange = wm_prog_data->per_coarse_pixel_dispatch; +#endif + } +} +#endif + +static void +emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline) +{ + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_STATISTICS), vfs) { + vfs.StatisticsEnable = true; + } +} + +static void +compute_kill_pixel(struct anv_graphics_pipeline *pipeline, + const struct vk_multisample_state *ms, + const struct vk_render_pass_state *rp) +{ + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { + pipeline->kill_pixel = false; + return; + } + + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + + /* This computes the KillPixel portion of the computation for whether or + * not we want to enable the PMA fix on gfx8 or gfx9. It's given by this + * chunk of the giant formula: + * + * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) + * + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is + * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept + * of an alpha test. 
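+    *
+    * With those two terms dropped, what remains of the formula is exactly
+    * what gets computed below:
+    *
+    *    kill_pixel = self_dependency || uses_kill || uses_omask ||
+    *                 alpha_to_coverage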
+ */ + pipeline->kill_pixel = + rp->depth_self_dependency || + rp->stencil_self_dependency || + wm_prog_data->uses_kill || + wm_prog_data->uses_omask || + (ms && ms->alpha_to_coverage_enable); +} + +#if GFX_VER == 12 +static void +emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline, + const struct vk_render_pass_state *rp) +{ + const int replication_count = + anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots; + + assert(replication_count >= 1); + if (replication_count == 1) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); + return; + } + + uint32_t view_mask = rp->view_mask; + assert(replication_count == util_bitcount(view_mask)); + assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION); + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) { + pr.ReplicaMask = (1 << replication_count) - 1; + pr.ReplicationCount = replication_count - 1; + + int i = 0; + u_foreach_bit(view_index, rp->view_mask) { + pr.RTAIOffset[i] = view_index; + i++; + } + } +} +#endif + +#if GFX_VERx10 >= 125 +static void +emit_task_state(struct anv_graphics_pipeline *pipeline) +{ + assert(anv_pipeline_is_mesh(pipeline)); + + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_CONTROL), zero); + return; + } + + const struct anv_shader_bin *task_bin = pipeline->shaders[MESA_SHADER_TASK]; + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_CONTROL), tc) { + tc.TaskShaderEnable = true; + tc.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_TASK, task_bin); + } + + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline); + const struct brw_cs_dispatch_info task_dispatch = + brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL); + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_SHADER), task) { + task.KernelStartPointer = task_bin->kernel.offset; + task.SIMDSize = task_dispatch.simd_size / 16; + task.MessageSIMD = task.SIMDSize; + task.NumberofThreadsinGPGPUThreadGroup = task_dispatch.threads; + task.ExecutionMask = task_dispatch.right_mask; + task.LocalXMaximum = task_dispatch.group_size - 1; + task.EmitLocalIDX = true; + + task.NumberofBarriers = task_prog_data->base.uses_barrier; + task.SharedLocalMemorySize = + encode_slm_size(GFX_VER, task_prog_data->base.base.total_shared); + + /* + * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for an address + * of a buffer with push constants and descriptor set table and + * InlineData[2:7] will be used for first few push constants. + */ + task.EmitInlineParameter = true; + + task.XP0Required = task_prog_data->uses_drawid; + } + + /* Recommended values from "Task and Mesh Distribution Programming". */ + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_REDISTRIB), redistrib) { + redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1; + redistrib.SmallTaskThreshold = 1; /* 2^N */ + redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */ + redistrib.TaskRedistributionLevel = TASKREDISTRIB_BOM; + + /* TODO: We have an unknown issue with Task Payload when task redistribution + * is enabled. Disable it for now. 
+ * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/7141 + */ + redistrib.TaskRedistributionMode = TASKREDISTRIB_OFF; + } +} + +static void +emit_mesh_state(struct anv_graphics_pipeline *pipeline) +{ + assert(anv_pipeline_is_mesh(pipeline)); + + const struct anv_shader_bin *mesh_bin = pipeline->shaders[MESA_SHADER_MESH]; + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MESH_CONTROL), mc) { + mc.MeshShaderEnable = true; + mc.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_MESH, mesh_bin); + + /* TODO(mesh): MaximumNumberofThreadGroups. */ + } + + const struct intel_device_info *devinfo = pipeline->base.device->info; + const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline); + const struct brw_cs_dispatch_info mesh_dispatch = + brw_cs_get_dispatch_info(devinfo, &mesh_prog_data->base, NULL); + + const unsigned output_topology = + mesh_prog_data->primitive_type == SHADER_PRIM_POINTS ? OUTPUT_POINT : + mesh_prog_data->primitive_type == SHADER_PRIM_LINES ? OUTPUT_LINE : + OUTPUT_TRI; + + uint32_t index_format; + switch (mesh_prog_data->index_format) { + case BRW_INDEX_FORMAT_U32: + index_format = INDEX_U32; + break; + default: + unreachable("invalid index format"); + } + + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MESH_SHADER), mesh) { + mesh.KernelStartPointer = mesh_bin->kernel.offset; + mesh.SIMDSize = mesh_dispatch.simd_size / 16; + mesh.MessageSIMD = mesh.SIMDSize; + mesh.NumberofThreadsinGPGPUThreadGroup = mesh_dispatch.threads; + mesh.ExecutionMask = mesh_dispatch.right_mask; + mesh.LocalXMaximum = mesh_dispatch.group_size - 1; + mesh.EmitLocalIDX = true; + + mesh.MaximumPrimitiveCount = mesh_prog_data->map.max_primitives - 1; + mesh.OutputTopology = output_topology; + mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_pitch_dw / 8; + mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_pitch_dw > 0; + mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_pitch_dw / 8; + mesh.IndexFormat = index_format; + + mesh.NumberofBarriers = mesh_prog_data->base.uses_barrier; + mesh.SharedLocalMemorySize = + encode_slm_size(GFX_VER, mesh_prog_data->base.base.total_shared); + + /* + * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for an address + * of a buffer with push constants and descriptor set table and + * InlineData[2:7] will be used for first few push constants. + */ + mesh.EmitInlineParameter = true; + + mesh.XP0Required = mesh_prog_data->uses_drawid; + } + + /* Recommended values from "Task and Mesh Distribution Programming". */ + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MESH_DISTRIB), distrib) { + distrib.DistributionMode = MESH_RR_FREE; + distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 8 : 9; /* 2^N thread groups */ + distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 
5 : 3; /* 2^N thread groups */ + } +} +#endif + +void +genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline, + const struct vk_graphics_pipeline_state *state) +{ + enum intel_urb_deref_block_size urb_deref_block_size; + emit_urb_setup(pipeline, &urb_deref_block_size); + + assert(state->rs != NULL); + emit_rs_state(pipeline, state->ia, state->rs, state->ms, state->rp, + urb_deref_block_size); + emit_ms_state(pipeline, state->ms); + emit_cb_state(pipeline, state->cb, state->ms); + compute_kill_pixel(pipeline, state->ms, state->rp); + + emit_3dstate_clip(pipeline, state->ia, state->vp, state->rs); + +#if GFX_VER == 12 + emit_3dstate_primitive_replication(pipeline, state->rp); +#endif + +#if 0 + /* From gfx7_vs_state.c */ + + /** + * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages > + * Geometry > Geometry Shader > State: + * + * "Note: Because of corruption in IVB:GT2, software needs to flush the + * whole fixed function pipeline when the GS enable changes value in + * the 3DSTATE_GS." + * + * The hardware architects have clarified that in this context "flush the + * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS + * Stall" bit set. + */ + if (device->info->platform == INTEL_PLATFORM_IVB) + gfx7_emit_vs_workaround_flush(brw); +#endif + + if (anv_pipeline_is_primitive(pipeline)) { + emit_vertex_input(pipeline, state->vi); + + emit_3dstate_vs(pipeline); + emit_3dstate_hs_te_ds(pipeline, state->ts); + emit_3dstate_gs(pipeline); + + emit_3dstate_vf_statistics(pipeline); + + emit_3dstate_streamout(pipeline, state->rs); + +#if GFX_VERx10 >= 125 + const struct anv_device *device = pipeline->base.device; + /* Disable Mesh. */ + if (device->physical->vk.supported_extensions.NV_mesh_shader) { + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_MESH_CONTROL), zero); + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TASK_CONTROL), zero); + } +#endif + } else { + assert(anv_pipeline_is_mesh(pipeline)); + + /* BSpec 46303 forbids both 3DSTATE_MESH_CONTROL.MeshShaderEnable + * and 3DSTATE_STREAMOUT.SOFunctionEnable to be 1. 
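+       *
+       * Emitting the packet below with no fields set leaves SOFunctionEnable
+       * at its zeroed default, which keeps mesh pipelines within that
+       * restriction.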
+ */ + anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), so) {} + +#if GFX_VERx10 >= 125 + emit_task_state(pipeline); + emit_mesh_state(pipeline); +#endif + } + + emit_3dstate_sbe(pipeline); + emit_3dstate_wm(pipeline, state->ia, state->rs, + state->ms, state->cb, state->rp); + emit_3dstate_ps(pipeline, state->ms, state->cb); +#if GFX_VER >= 8 + emit_3dstate_ps_extra(pipeline, state->rs, state->rp); +#endif +} + +#if GFX_VERx10 >= 125 + +void +genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) +{ + struct anv_device *device = pipeline->base.device; + const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); + anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0); + + const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs; + const struct intel_device_info *devinfo = device->info; + + anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) { + cfe.MaximumNumberofThreads = + devinfo->max_cs_threads * devinfo->subslice_total; + cfe.ScratchSpaceBuffer = + get_scratch_surf(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin); + } +} + +#else /* #if GFX_VERx10 >= 125 */ + +void +genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) +{ + struct anv_device *device = pipeline->base.device; + const struct intel_device_info *devinfo = device->info; + const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); + + anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0); + + const struct brw_cs_dispatch_info dispatch = + brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL); + const uint32_t vfe_curbe_allocation = + ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads + + cs_prog_data->push.cross_thread.regs, 2); + + const struct anv_shader_bin *cs_bin = pipeline->cs; + + anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) { +#if GFX_VER > 7 + vfe.StackSize = 0; +#else + vfe.GPGPUMode = true; +#endif + vfe.MaximumNumberofThreads = + devinfo->max_cs_threads * devinfo->subslice_total - 1; + vfe.NumberofURBEntries = GFX_VER <= 7 ? 0 : 2; +#if GFX_VER < 11 + vfe.ResetGatewayTimer = true; +#endif +#if GFX_VER <= 8 + vfe.BypassGatewayControl = true; +#endif + vfe.URBEntryAllocationSize = GFX_VER <= 7 ? 0 : 2; + vfe.CURBEAllocationSize = vfe_curbe_allocation; + + if (cs_bin->prog_data->total_scratch) { + if (GFX_VER >= 8) { + /* Broadwell's Per Thread Scratch Space is in the range [0, 11] + * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M. + */ + vfe.PerThreadScratchSpace = + ffs(cs_bin->prog_data->total_scratch) - 11; + } else if (GFX_VERx10 == 75) { + /* Haswell's Per Thread Scratch Space is in the range [0, 10] + * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M. + */ + vfe.PerThreadScratchSpace = + ffs(cs_bin->prog_data->total_scratch) - 12; + } else { + /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB] + * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB. + */ + vfe.PerThreadScratchSpace = + cs_bin->prog_data->total_scratch / 1024 - 1; + } + vfe.ScratchSpaceBasePointer = + get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin); + } + } + + struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = { + .KernelStartPointer = + cs_bin->kernel.offset + + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size), + + /* Wa_1606682166 */ + .SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin), + /* We add 1 because the CS indirect parameters buffer isn't accounted + * for in bind_map.surface_count. 
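+       *
+       * The MIN2(..., 30) below keeps the resulting 1 + count within the
+       * range this field can hold; as with the sampler count, larger binding
+       * tables still work, the excess entries just are not prefetched.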
+ */ + .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30), + .BarrierEnable = cs_prog_data->uses_barrier, + .SharedLocalMemorySize = + encode_slm_size(GFX_VER, cs_prog_data->base.total_shared), + +#if GFX_VERx10 != 75 + .ConstantURBEntryReadOffset = 0, +#endif + .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs, +#if GFX_VERx10 >= 75 + .CrossThreadConstantDataReadLength = + cs_prog_data->push.cross_thread.regs, +#endif +#if GFX_VER >= 12 + /* TODO: Check if we are missing workarounds and enable mid-thread + * preemption. + * + * We still have issues with mid-thread preemption (it was already + * disabled by the kernel on gfx11, due to missing workarounds). It's + * possible that we are just missing some workarounds, and could enable + * it later, but for now let's disable it to fix a GPU in compute in Car + * Chase (and possibly more). + */ + .ThreadPreemptionDisable = true, +#endif + + .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, + }; + GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, + pipeline->interface_descriptor_data, + &desc); +} + +#endif /* #if GFX_VERx10 >= 125 */ + +#if GFX_VERx10 >= 125 + +void +genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline) +{ + for (uint32_t i = 0; i < pipeline->group_count; i++) { + struct anv_rt_shader_group *group = &pipeline->groups[i]; + + switch (group->type) { + case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: { + struct GFX_RT_GENERAL_SBT_HANDLE sh = {}; + sh.General = anv_shader_bin_get_bsr(group->general, 32); + GFX_RT_GENERAL_SBT_HANDLE_pack(NULL, group->handle, &sh); + break; + } + + case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: { + struct GFX_RT_TRIANGLES_SBT_HANDLE sh = {}; + if (group->closest_hit) + sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32); + if (group->any_hit) + sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24); + GFX_RT_TRIANGLES_SBT_HANDLE_pack(NULL, group->handle, &sh); + break; + } + + case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: { + struct GFX_RT_PROCEDURAL_SBT_HANDLE sh = {}; + if (group->closest_hit) + sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32); + sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24); + GFX_RT_PROCEDURAL_SBT_HANDLE_pack(NULL, group->handle, &sh); + break; + } + + default: + unreachable("Invalid shader group type"); + } + } +} + +#else + +void +genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline) +{ + unreachable("Ray tracing not supported"); +} + +#endif /* GFX_VERx10 >= 125 */ diff --git a/src/intel/vulkan_hasvk/genX_query.c b/src/intel/vulkan_hasvk/genX_query.c new file mode 100644 index 00000000000..8c20e2cdfe1 --- /dev/null +++ b/src/intel/vulkan_hasvk/genX_query.c @@ -0,0 +1,1530 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "anv_private.h" + +#include "util/os_time.h" + +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" + +/* We reserve : + * - GPR 14 for perf queries + * - GPR 15 for conditional rendering + */ +#define MI_BUILDER_NUM_ALLOC_GPRS 14 +#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8 +#define __gen_get_batch_dwords anv_batch_emit_dwords +#define __gen_address_offset anv_address_add +#define __gen_get_batch_address(b, a) anv_batch_address(b, a) +#include "common/mi_builder.h" +#include "perf/intel_perf.h" +#include "perf/intel_perf_mdapi.h" +#include "perf/intel_perf_regs.h" + +#include "vk_util.h" + +static struct anv_address +anv_query_address(struct anv_query_pool *pool, uint32_t query) +{ + return (struct anv_address) { + .bo = pool->bo, + .offset = query * pool->stride, + }; +} + +VkResult genX(CreateQueryPool)( + VkDevice _device, + const VkQueryPoolCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkQueryPool* pQueryPool) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + const struct anv_physical_device *pdevice = device->physical; +#if GFX_VER >= 8 + const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL; + struct intel_perf_counter_pass *counter_pass; + struct intel_perf_query_info **pass_query; + uint32_t n_passes = 0; +#endif + uint32_t data_offset = 0; + VK_MULTIALLOC(ma); + VkResult result; + + assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); + + /* Query pool slots are made up of some number of 64-bit values packed + * tightly together. For most query types have the first 64-bit value is + * the "available" bit which is 0 when the query is unavailable and 1 when + * it is available. The 64-bit values that follow are determined by the + * type of query. + * + * For performance queries, we have a requirement to align OA reports at + * 64bytes so we put those first and have the "available" bit behind + * together with some other counters. + */ + uint32_t uint64s_per_slot = 0; + + VK_MULTIALLOC_DECL(&ma, struct anv_query_pool, pool, 1); + + VkQueryPipelineStatisticFlags pipeline_statistics = 0; + switch (pCreateInfo->queryType) { + case VK_QUERY_TYPE_OCCLUSION: + /* Occlusion queries have two values: begin and end. */ + uint64s_per_slot = 1 + 2; + break; + case VK_QUERY_TYPE_TIMESTAMP: + /* Timestamps just have the one timestamp value */ + uint64s_per_slot = 1 + 1; + break; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + pipeline_statistics = pCreateInfo->pipelineStatistics; + /* We're going to trust this field implicitly so we need to ensure that + * no unhandled extension bits leak in. + */ + pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK; + + /* Statistics queries have a min and max for every statistic */ + uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics); + break; + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + /* Transform feedback queries are 4 values, begin/end for + * written/available. 
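+       *
+       * Concretely, the slot layout the rest of this file assumes is:
+       *
+       *    slot[0]      availability
+       *    slot[1..2]   SO_NUM_PRIMS_WRITTEN    begin / end
+       *    slot[3..4]   SO_PRIM_STORAGE_NEEDED  begin / end
+       *
+       * hence the 1 + 4 below.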
+ */ + uint64s_per_slot = 1 + 4; + break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + const struct intel_perf_query_field_layout *layout = + &pdevice->perf->query_layout; + + uint64s_per_slot = 2; /* availability + marker */ + /* Align to the requirement of the layout */ + uint64s_per_slot = align_u32(uint64s_per_slot, + DIV_ROUND_UP(layout->alignment, sizeof(uint64_t))); + data_offset = uint64s_per_slot * sizeof(uint64_t); + /* Add the query data for begin & end commands */ + uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t)); + break; + } +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + const struct intel_perf_query_field_layout *layout = + &pdevice->perf->query_layout; + + perf_query_info = vk_find_struct_const(pCreateInfo->pNext, + QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); + n_passes = intel_perf_get_n_passes(pdevice->perf, + perf_query_info->pCounterIndices, + perf_query_info->counterIndexCount, + NULL); + vk_multialloc_add(&ma, &counter_pass, struct intel_perf_counter_pass, + perf_query_info->counterIndexCount); + vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *, + n_passes); + uint64s_per_slot = 4 /* availability + small batch */; + /* Align to the requirement of the layout */ + uint64s_per_slot = align_u32(uint64s_per_slot, + DIV_ROUND_UP(layout->alignment, sizeof(uint64_t))); + data_offset = uint64s_per_slot * sizeof(uint64_t); + /* Add the query data for begin & end commands */ + uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t)); + /* Multiply by the number of passes */ + uint64s_per_slot *= n_passes; + break; + } +#endif + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + /* Query has two values: begin and end. */ + uint64s_per_slot = 1 + 2; + break; + default: + assert(!"Invalid query type"); + } + + if (!vk_object_multialloc(&device->vk, &ma, pAllocator, + VK_OBJECT_TYPE_QUERY_POOL)) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + pool->type = pCreateInfo->queryType; + pool->pipeline_statistics = pipeline_statistics; + pool->stride = uint64s_per_slot * sizeof(uint64_t); + pool->slots = pCreateInfo->queryCount; + + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) { + pool->data_offset = data_offset; + pool->snapshot_size = (pool->stride - data_offset) / 2; + } +#if GFX_VER >= 8 + else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + pool->pass_size = pool->stride / n_passes; + pool->data_offset = data_offset; + pool->snapshot_size = (pool->pass_size - data_offset) / 2; + pool->n_counters = perf_query_info->counterIndexCount; + pool->counter_pass = counter_pass; + intel_perf_get_counters_passes(pdevice->perf, + perf_query_info->pCounterIndices, + perf_query_info->counterIndexCount, + pool->counter_pass); + pool->n_passes = n_passes; + pool->pass_query = pass_query; + intel_perf_get_n_passes(pdevice->perf, + perf_query_info->pCounterIndices, + perf_query_info->counterIndexCount, + pool->pass_query); + } +#endif + + uint64_t size = pool->slots * (uint64_t)pool->stride; + result = anv_device_alloc_bo(device, "query-pool", size, + ANV_BO_ALLOC_MAPPED | + ANV_BO_ALLOC_SNOOPED, + 0 /* explicit_address */, + &pool->bo); + if (result != VK_SUCCESS) + goto fail; + +#if GFX_VER >= 8 + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + struct mi_builder b; + struct anv_batch batch = { + .start = pool->bo->map + khr_perf_query_preamble_offset(pool, p), + .end = pool->bo->map + khr_perf_query_preamble_offset(pool, p) + 
pool->data_offset, + }; + batch.next = batch.start; + + mi_builder_init(&b, device->info, &batch); + mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG), + mi_imm(p * (uint64_t)pool->pass_size)); + anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); + } + } +#endif + + *pQueryPool = anv_query_pool_to_handle(pool); + + return VK_SUCCESS; + + fail: + vk_free2(&device->vk.alloc, pAllocator, pool); + + return result; +} + +void genX(DestroyQueryPool)( + VkDevice _device, + VkQueryPool _pool, + const VkAllocationCallbacks* pAllocator) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_query_pool, pool, _pool); + + if (!pool) + return; + + anv_device_release_bo(device, pool->bo); + vk_object_free(&device->vk, pAllocator, pool); +} + +#if GFX_VER >= 8 +/** + * VK_KHR_performance_query layout : + * + * -------------------------------------------- + * | availability (8b) | | | + * |-------------------------------| | | + * | Small batch loading | | | + * | ANV_PERF_QUERY_OFFSET_REG | | | + * | (24b) | | Pass 0 | + * |-------------------------------| | | + * | some padding (see | | | + * | query_field_layout:alignment) | | | + * |-------------------------------| | | + * | query data | | | + * | (2 * query_field_layout:size) | | | + * |-------------------------------|-- | Query 0 + * | availability (8b) | | | + * |-------------------------------| | | + * | Small batch loading | | | + * | ANV_PERF_QUERY_OFFSET_REG | | | + * | (24b) | | Pass 1 | + * |-------------------------------| | | + * | some padding (see | | | + * | query_field_layout:alignment) | | | + * |-------------------------------| | | + * | query data | | | + * | (2 * query_field_layout:size) | | | + * |-------------------------------|----------- + * | availability (8b) | | | + * |-------------------------------| | | + * | Small batch loading | | | + * | ANV_PERF_QUERY_OFFSET_REG | | | + * | (24b) | | Pass 0 | + * |-------------------------------| | | + * | some padding (see | | | + * | query_field_layout:alignment) | | | + * |-------------------------------| | | + * | query data | | | + * | (2 * query_field_layout:size) | | | + * |-------------------------------|-- | Query 1 + * | ... | | | + * -------------------------------------------- + */ + +static uint64_t +khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass) +{ + return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size; +} + +static uint64_t +khr_perf_query_data_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end) +{ + return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size + + pool->data_offset + (end ? 
pool->snapshot_size : 0); +} + +static struct anv_address +khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass) +{ + return anv_address_add( + (struct anv_address) { .bo = pool->bo, }, + khr_perf_query_availability_offset(pool, query, pass)); +} + +static struct anv_address +khr_perf_query_data_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end) +{ + return anv_address_add( + (struct anv_address) { .bo = pool->bo, }, + khr_perf_query_data_offset(pool, query, pass, end)); +} + +static bool +khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer) +{ + if (anv_batch_has_error(&cmd_buffer->batch)) + return false; + + if (cmd_buffer->self_mod_locations) + return true; + + struct anv_device *device = cmd_buffer->device; + const struct anv_physical_device *pdevice = device->physical; + + cmd_buffer->self_mod_locations = + vk_alloc(&cmd_buffer->vk.pool->alloc, + pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (!cmd_buffer->self_mod_locations) { + anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY); + return false; + } + + return true; +} +#endif + +/** + * VK_INTEL_performance_query layout : + * + * --------------------------------- + * | availability (8b) | + * |-------------------------------| + * | marker (8b) | + * |-------------------------------| + * | some padding (see | + * | query_field_layout:alignment) | + * |-------------------------------| + * | query data | + * | (2 * query_field_layout:size) | + * --------------------------------- + */ + +static uint32_t +intel_perf_marker_offset(void) +{ + return 8; +} + +static uint32_t +intel_perf_query_data_offset(struct anv_query_pool *pool, bool end) +{ + return pool->data_offset + (end ? 
pool->snapshot_size : 0); +} + +static void +cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags, + uint32_t value_index, uint64_t result) +{ + if (flags & VK_QUERY_RESULT_64_BIT) { + uint64_t *dst64 = dst_slot; + dst64[value_index] = result; + } else { + uint32_t *dst32 = dst_slot; + dst32[value_index] = result; + } +} + +static void * +query_slot(struct anv_query_pool *pool, uint32_t query) +{ + return pool->bo->map + query * pool->stride; +} + +static bool +query_is_available(struct anv_query_pool *pool, uint32_t query) +{ +#if GFX_VER >= 8 + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + volatile uint64_t *slot = + pool->bo->map + khr_perf_query_availability_offset(pool, query, p); + if (!slot[0]) + return false; + } + return true; + } +#endif + + return *(volatile uint64_t *)query_slot(pool, query); +} + +static VkResult +wait_for_available(struct anv_device *device, + struct anv_query_pool *pool, uint32_t query) +{ + uint64_t abs_timeout_ns = os_time_get_absolute_timeout(2 * NSEC_PER_SEC); + + while (os_time_get_nano() < abs_timeout_ns) { + if (query_is_available(pool, query)) + return VK_SUCCESS; + VkResult status = vk_device_check_status(&device->vk); + if (status != VK_SUCCESS) + return status; + } + + return vk_device_set_lost(&device->vk, "query timeout"); +} + +VkResult genX(GetQueryPoolResults)( + VkDevice _device, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount, + size_t dataSize, + void* pData, + VkDeviceSize stride, + VkQueryResultFlags flags) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + + assert(pool->type == VK_QUERY_TYPE_OCCLUSION || + pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || + pool->type == VK_QUERY_TYPE_TIMESTAMP || + pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT || + pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || + pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL || + pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT); + + if (vk_device_is_lost(&device->vk)) + return VK_ERROR_DEVICE_LOST; + + if (pData == NULL) + return VK_SUCCESS; + + void *data_end = pData + dataSize; + + VkResult status = VK_SUCCESS; + for (uint32_t i = 0; i < queryCount; i++) { + bool available = query_is_available(pool, firstQuery + i); + + if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) { + status = wait_for_available(device, pool, firstQuery + i); + if (status != VK_SUCCESS) { + return status; + } + + available = true; + } + + /* From the Vulkan 1.0.42 spec: + * + * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are + * both not set then no result values are written to pData for + * queries that are in the unavailable state at the time of the call, + * and vkGetQueryPoolResults returns VK_NOT_READY. However, + * availability state is still written to pData for those queries if + * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set." 
+ * + * From VK_KHR_performance_query : + * + * "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies + * that the result should contain the number of counters that were recorded + * into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR" + */ + bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT); + + uint32_t idx = 0; + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + uint64_t *slot = query_slot(pool, firstQuery + i); + if (write_results) { + /* From the Vulkan 1.2.132 spec: + * + * "If VK_QUERY_RESULT_PARTIAL_BIT is set, + * VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status + * is unavailable, an intermediate result value between zero and + * the final result value is written to pData for that query." + */ + uint64_t result = available ? slot[2] - slot[1] : 0; + cpu_write_query_result(pData, flags, idx, result); + } + idx++; + break; + } + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + uint64_t *slot = query_slot(pool, firstQuery + i); + uint32_t statistics = pool->pipeline_statistics; + while (statistics) { + uint32_t stat = u_bit_scan(&statistics); + if (write_results) { + uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1]; + + /* WaDividePSInvocationCountBy4:HSW,BDW */ + if ((device->info->ver == 8 || device->info->verx10 == 75) && + (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) + result >>= 2; + + cpu_write_query_result(pData, flags, idx, result); + } + idx++; + } + assert(idx == util_bitcount(pool->pipeline_statistics)); + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + uint64_t *slot = query_slot(pool, firstQuery + i); + if (write_results) + cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]); + idx++; + if (write_results) + cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]); + idx++; + break; + } + + case VK_QUERY_TYPE_TIMESTAMP: { + uint64_t *slot = query_slot(pool, firstQuery + i); + if (write_results) + cpu_write_query_result(pData, flags, idx, slot[1]); + idx++; + break; + } + +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + const struct anv_physical_device *pdevice = device->physical; + assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT | + VK_QUERY_RESULT_PARTIAL_BIT)) == 0); + for (uint32_t p = 0; p < pool->n_passes; p++) { + const struct intel_perf_query_info *query = pool->pass_query[p]; + struct intel_perf_query_result result; + intel_perf_query_result_clear(&result); + intel_perf_query_result_accumulate_fields(&result, query, + pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false), + pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true), + false /* no_oa_accumulate */); + anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData); + } + break; + } +#endif + + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + if (!write_results) + break; + const void *query_data = query_slot(pool, firstQuery + i); + const struct intel_perf_query_info *query = &device->physical->perf->queries[0]; + struct intel_perf_query_result result; + intel_perf_query_result_clear(&result); + intel_perf_query_result_accumulate_fields(&result, query, + query_data + intel_perf_query_data_offset(pool, false), + query_data + intel_perf_query_data_offset(pool, true), + false /* no_oa_accumulate */); + intel_perf_query_result_write_mdapi(pData, stride, + device->info, + query, &result); + const uint64_t *marker = query_data + intel_perf_marker_offset(); 
+ intel_perf_query_mdapi_write_marker(pData, stride, device->info, *marker); + break; + } + + default: + unreachable("invalid pool type"); + } + + if (!write_results) + status = VK_NOT_READY; + + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) + cpu_write_query_result(pData, flags, idx, available); + + pData += stride; + if (pData >= data_end) + break; + } + + return status; +} + +static void +emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr) +{ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DestinationAddressType = DAT_PPGTT; + pc.PostSyncOperation = WritePSDepthCount; + pc.DepthStallEnable = true; + pc.Address = addr; + + if (GFX_VER == 9 && cmd_buffer->device->info->gt == 4) + pc.CommandStreamerStallEnable = true; + } +} + +static void +emit_query_mi_availability(struct mi_builder *b, + struct anv_address addr, + bool available) +{ + mi_store(b, mi_mem64(addr), mi_imm(available)); +} + +static void +emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer, + struct anv_address addr, + bool available) +{ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DestinationAddressType = DAT_PPGTT; + pc.PostSyncOperation = WriteImmediateData; + pc.Address = addr; + pc.ImmediateData = available; + } +} + +/** + * Goes through a series of consecutive query indices in the given pool + * setting all element values to 0 and emitting them as available. + */ +static void +emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, + struct mi_builder *b, struct anv_query_pool *pool, + uint32_t first_index, uint32_t num_queries) +{ + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_TIMESTAMP: + /* These queries are written with a PIPE_CONTROL so clear them using the + * PIPE_CONTROL as well so we don't have to synchronize between 2 types + * of operations. 
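+       *
+       * Note that the loop below zeroes the data qwords first and only then
+       * writes the availability qword (offset 0), so a freshly zeroed query
+       * never looks available before its values are in place.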
+ */ + assert((pool->stride % 8) == 0); + for (uint32_t i = 0; i < num_queries; i++) { + struct anv_address slot_addr = + anv_query_address(pool, first_index + i); + + for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) { + emit_query_pc_availability(cmd_buffer, + anv_address_add(slot_addr, qword * 8), + false); + } + emit_query_pc_availability(cmd_buffer, slot_addr, true); + } + break; + + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + for (uint32_t i = 0; i < num_queries; i++) { + struct anv_address slot_addr = + anv_query_address(pool, first_index + i); + mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8); + emit_query_mi_availability(b, slot_addr, true); + } + break; + +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + for (uint32_t i = 0; i < num_queries; i++) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + mi_memset(b, khr_perf_query_data_address(pool, first_index + i, p, false), + 0, 2 * pool->snapshot_size); + emit_query_mi_availability(b, + khr_perf_query_availability_address(pool, first_index + i, p), + true); + } + } + break; + } +#endif + + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: + for (uint32_t i = 0; i < num_queries; i++) { + struct anv_address slot_addr = + anv_query_address(pool, first_index + i); + mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8); + emit_query_mi_availability(b, slot_addr, true); + } + break; + + default: + unreachable("Unsupported query type"); + } +} + +void genX(CmdResetQueryPool)( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + for (uint32_t i = 0; i < queryCount; i++) { + emit_query_pc_availability(cmd_buffer, + anv_query_address(pool, firstQuery + i), + false); + } + break; + + case VK_QUERY_TYPE_TIMESTAMP: { + for (uint32_t i = 0; i < queryCount; i++) { + emit_query_pc_availability(cmd_buffer, + anv_query_address(pool, firstQuery + i), + false); + } + + /* Add a CS stall here to make sure the PIPE_CONTROL above has + * completed. Otherwise some timestamps written later with MI_STORE_* + * commands might race with the PIPE_CONTROL in the loop above. 
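+       *
+       * In other words, without the stall a timestamp written right after
+       * this reset with an MI_STORE_* command could land first and then be
+       * clobbered when the PIPE_CONTROL's write finally retires.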
+ */ + anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT, + "vkCmdResetQueryPool of timestamps"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + break; + } + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t i = 0; i < queryCount; i++) + emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false); + break; + } + +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t i = 0; i < queryCount; i++) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + emit_query_mi_availability( + &b, + khr_perf_query_availability_address(pool, firstQuery + i, p), + false); + } + } + break; + } +#endif + + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + for (uint32_t i = 0; i < queryCount; i++) + emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false); + break; + } + + default: + unreachable("Unsupported query type"); + } +} + +void genX(ResetQueryPool)( + VkDevice _device, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount) +{ + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + + for (uint32_t i = 0; i < queryCount; i++) { + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { +#if GFX_VER >= 8 + for (uint32_t p = 0; p < pool->n_passes; p++) { + uint64_t *pass_slot = pool->bo->map + + khr_perf_query_availability_offset(pool, firstQuery + i, p); + *pass_slot = 0; + } +#endif + } else { + uint64_t *slot = query_slot(pool, firstQuery + i); + *slot = 0; + } + } +} + +static const uint32_t vk_pipeline_stat_to_reg[] = { + GENX(IA_VERTICES_COUNT_num), + GENX(IA_PRIMITIVES_COUNT_num), + GENX(VS_INVOCATION_COUNT_num), + GENX(GS_INVOCATION_COUNT_num), + GENX(GS_PRIMITIVES_COUNT_num), + GENX(CL_INVOCATION_COUNT_num), + GENX(CL_PRIMITIVES_COUNT_num), + GENX(PS_INVOCATION_COUNT_num), + GENX(HS_INVOCATION_COUNT_num), + GENX(DS_INVOCATION_COUNT_num), + GENX(CS_INVOCATION_COUNT_num), +}; + +static void +emit_pipeline_stat(struct mi_builder *b, uint32_t stat, + struct anv_address addr) +{ + STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK == + (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1); + + assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg)); + mi_store(b, mi_mem64(addr), mi_reg64(vk_pipeline_stat_to_reg[stat])); +} + +static void +emit_xfb_query(struct mi_builder *b, uint32_t stream, + struct anv_address addr) +{ + assert(stream < MAX_XFB_STREAMS); + + mi_store(b, mi_mem64(anv_address_add(addr, 0)), + mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8)); + mi_store(b, mi_mem64(anv_address_add(addr, 16)), + mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8)); +} + +static void +emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer, + struct anv_query_pool *pool, + struct mi_builder *b, + struct anv_address query_addr, + bool end) +{ + const struct intel_perf_query_field_layout *layout = + &cmd_buffer->device->physical->perf->query_layout; + struct anv_address data_addr = + anv_address_add(query_addr, intel_perf_query_data_offset(pool, end)); + + for (uint32_t f = 0; f < layout->n_fields; f++) { + const struct intel_perf_query_field *field = + &layout->fields[end ? 
f : (layout->n_fields - 1 - f)]; + + switch (field->type) { + case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: + anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) { + rpc.MemoryAddress = anv_address_add(data_addr, field->location); + } + break; + + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: { + struct anv_address addr = anv_address_add(data_addr, field->location); + struct mi_value src = field->size == 8 ? + mi_reg64(field->mmio_offset) : + mi_reg32(field->mmio_offset); + struct mi_value dst = field->size == 8 ? + mi_mem64(addr) : mi_mem32(addr); + mi_store(b, dst, src); + break; + } + + default: + unreachable("Invalid query field"); + break; + } + } +} + +void genX(CmdBeginQuery)( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query, + VkQueryControlFlags flags) +{ + genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0); +} + +void genX(CmdBeginQueryIndexedEXT)( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query, + VkQueryControlFlags flags, + uint32_t index) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + struct anv_address query_addr = anv_query_address(pool, query); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8)); + break; + + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)), + mi_reg64(GENX(CL_INVOCATION_COUNT_num))); + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + /* TODO: This might only be necessary for certain stats */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + + uint32_t statistics = pool->pipeline_statistics; + uint32_t offset = 8; + while (statistics) { + uint32_t stat = u_bit_scan(&statistics); + emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset)); + offset += 16; + } + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + emit_xfb_query(&b, index, anv_address_add(query_addr, 8)); + break; + +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + if (!khr_perf_query_ensure_relocs(cmd_buffer)) + return; + + const struct anv_physical_device *pdevice = cmd_buffer->device->physical; + const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout; + + uint32_t reloc_idx = 0; + for (uint32_t end = 0; end < 2; end++) { + for (uint32_t r = 0; r < layout->n_fields; r++) { + const struct intel_perf_query_field *field = + &layout->fields[end ? 
r : (layout->n_fields - 1 - r)]; + struct mi_value reg_addr = + mi_iadd( + &b, + mi_imm(intel_canonical_address(pool->bo->offset + + khr_perf_query_data_offset(pool, query, 0, end) + + field->location)), + mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); + cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr); + + if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC && + field->size == 8) { + reg_addr = + mi_iadd( + &b, + mi_imm(intel_canonical_address(pool->bo->offset + + khr_perf_query_data_offset(pool, query, 0, end) + + field->location + 4)), + mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); + cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr); + } + } + } + + struct mi_value availability_write_offset = + mi_iadd( + &b, + mi_imm( + intel_canonical_address( + pool->bo->offset + + khr_perf_query_availability_offset(pool, query, 0 /* pass */))), + mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); + cmd_buffer->self_mod_locations[reloc_idx++] = + mi_store_address(&b, availability_write_offset); + + assert(reloc_idx == pdevice->n_perf_query_commands); + + mi_self_mod_barrier(&b); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + cmd_buffer->perf_query_pool = pool; + + cmd_buffer->perf_reloc_idx = 0; + for (uint32_t r = 0; r < layout->n_fields; r++) { + const struct intel_perf_query_field *field = + &layout->fields[layout->n_fields - 1 - r]; + void *dws; + + switch (field->type) { + case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: + dws = anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_REPORT_PERF_COUNT_length), + GENX(MI_REPORT_PERF_COUNT), + .MemoryAddress = query_addr /* Will be overwritten */); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8); + break; + + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: + dws = + anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_STORE_REGISTER_MEM_length), + GENX(MI_STORE_REGISTER_MEM), + .RegisterAddress = field->mmio_offset, + .MemoryAddress = query_addr /* Will be overwritten */ ); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); + if (field->size == 8) { + dws = + anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_STORE_REGISTER_MEM_length), + GENX(MI_STORE_REGISTER_MEM), + .RegisterAddress = field->mmio_offset + 4, + .MemoryAddress = query_addr /* Will be overwritten */ ); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); + } + break; + + default: + unreachable("Invalid query field"); + break; + } + } + break; + } +#endif + + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false); + break; + } + + default: + unreachable(""); + } +} + +void genX(CmdEndQuery)( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t query) +{ + genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0); +} + +void genX(CmdEndQueryIndexedEXT)( + VkCommandBuffer 
commandBuffer, + VkQueryPool queryPool, + uint32_t query, + uint32_t index) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + struct anv_address query_addr = anv_query_address(pool, query); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16)); + emit_query_pc_availability(cmd_buffer, query_addr, true); + break; + + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + /* Ensure previous commands have completed before capturing the register + * value. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + + mi_store(&b, mi_mem64(anv_address_add(query_addr, 16)), + mi_reg64(GENX(CL_INVOCATION_COUNT_num))); + emit_query_mi_availability(&b, query_addr, true); + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + /* TODO: This might only be necessary for certain stats */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + + uint32_t statistics = pool->pipeline_statistics; + uint32_t offset = 16; + while (statistics) { + uint32_t stat = u_bit_scan(&statistics); + emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset)); + offset += 16; + } + + emit_query_mi_availability(&b, query_addr, true); + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + + emit_xfb_query(&b, index, anv_address_add(query_addr, 16)); + emit_query_mi_availability(&b, query_addr, true); + break; + +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + cmd_buffer->perf_query_pool = pool; + + if (!khr_perf_query_ensure_relocs(cmd_buffer)) + return; + + const struct anv_physical_device *pdevice = cmd_buffer->device->physical; + const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout; + + void *dws; + for (uint32_t r = 0; r < layout->n_fields; r++) { + const struct intel_perf_query_field *field = &layout->fields[r]; + + switch (field->type) { + case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: + dws = anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_REPORT_PERF_COUNT_length), + GENX(MI_REPORT_PERF_COUNT), + .MemoryAddress = query_addr /* Will be overwritten */); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8); + break; + + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: + case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: + dws = + anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_STORE_REGISTER_MEM_length), + GENX(MI_STORE_REGISTER_MEM), + .RegisterAddress = field->mmio_offset, + .MemoryAddress = query_addr /* Will be overwritten */ ); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); + if (field->size == 8) { + dws = + 
anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_STORE_REGISTER_MEM_length), + GENX(MI_STORE_REGISTER_MEM), + .RegisterAddress = field->mmio_offset + 4, + .MemoryAddress = query_addr /* Will be overwritten */ ); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8); + } + break; + + default: + unreachable("Invalid query field"); + break; + } + } + + dws = + anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_STORE_DATA_IMM_length), + GENX(MI_STORE_DATA_IMM), + .ImmediateData = true); + _mi_resolve_address_token(&b, + cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++], + dws + + GENX(MI_STORE_DATA_IMM_Address_start) / 8); + + assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands); + break; + } +#endif + + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + uint32_t marker_offset = intel_perf_marker_offset(); + mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)), + mi_imm(cmd_buffer->intel_perf_marker)); + emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, true); + emit_query_mi_availability(&b, query_addr, true); + break; + } + + default: + unreachable(""); + } + + /* When multiview is active the spec requires that N consecutive query + * indices are used, where N is the number of active views in the subpass. + * The spec allows that we only write the results to one of the queries + * but we still need to manage result availability for all the query indices. + * Since we only emit a single query for all active views in the + * first index, mark the other query indices as being already available + * with result 0. + */ + if (cmd_buffer->state.gfx.view_mask) { + const uint32_t num_queries = + util_bitcount(cmd_buffer->state.gfx.view_mask); + if (num_queries > 1) + emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1); + } +} + +#define TIMESTAMP 0x2358 + +void genX(CmdWriteTimestamp2)( + VkCommandBuffer commandBuffer, + VkPipelineStageFlags2 stage, + VkQueryPool queryPool, + uint32_t query) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + struct anv_address query_addr = anv_query_address(pool, query); + + assert(pool->type == VK_QUERY_TYPE_TIMESTAMP); + + struct mi_builder b; + mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch); + + if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) { + mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)), + mi_reg64(TIMESTAMP)); + emit_query_mi_availability(&b, query_addr, true); + } else { + /* Everything else is bottom-of-pipe */ + cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DestinationAddressType = DAT_PPGTT; + pc.PostSyncOperation = WriteTimestamp; + pc.Address = anv_address_add(query_addr, 8); + + if (GFX_VER == 9 && cmd_buffer->device->info->gt == 4) + pc.CommandStreamerStallEnable = true; + } + emit_query_pc_availability(cmd_buffer, query_addr, true); + } + + + /* When multiview is active the spec requires that N consecutive query + * indices are used, where N is the number of active views in the subpass. 
+ * The spec allows that we only write the results to one of the queries
+ * but we still need to manage result availability for all the query indices.
+ * Since we only emit a single query for all active views in the
+ * first index, mark the other query indices as being already available
+ * with result 0.
+ */
+ if (cmd_buffer->state.gfx.view_mask) {
+ const uint32_t num_queries =
+ util_bitcount(cmd_buffer->state.gfx.view_mask);
+ if (num_queries > 1)
+ emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
+ }
+}
+
+#if GFX_VERx10 >= 75
+
+#define MI_PREDICATE_SRC0 0x2400
+#define MI_PREDICATE_SRC1 0x2408
+#define MI_PREDICATE_RESULT 0x2418
+
+/**
+ * Writes the results of a query to dst_addr if the value at poll_addr is equal
+ * to the reference value.
+ */
+static void
+gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
+ struct mi_builder *b,
+ struct anv_address poll_addr,
+ struct anv_address dst_addr,
+ uint64_t ref_value,
+ VkQueryResultFlags flags,
+ uint32_t value_index,
+ struct mi_value query_result)
+{
+ mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem64(poll_addr));
+ mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(ref_value));
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOAD;
+ mip.CombineOperation = COMBINE_SET;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ }
+
+ if (flags & VK_QUERY_RESULT_64_BIT) {
+ struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
+ mi_store_if(b, mi_mem64(res_addr), query_result);
+ } else {
+ struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
+ mi_store_if(b, mi_mem32(res_addr), query_result);
+ }
+}
+
+static void
+gpu_write_query_result(struct mi_builder *b,
+ struct anv_address dst_addr,
+ VkQueryResultFlags flags,
+ uint32_t value_index,
+ struct mi_value query_result)
+{
+ if (flags & VK_QUERY_RESULT_64_BIT) {
+ struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
+ mi_store(b, mi_mem64(res_addr), query_result);
+ } else {
+ struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
+ mi_store(b, mi_mem32(res_addr), query_result);
+ }
+}
+
+static struct mi_value
+compute_query_result(struct mi_builder *b, struct anv_address addr)
+{
+ return mi_isub(b, mi_mem64(anv_address_add(addr, 8)),
+ mi_mem64(anv_address_add(addr, 0)));
+}
+
+void genX(CmdCopyQueryPoolResults)(
+ VkCommandBuffer commandBuffer,
+ VkQueryPool queryPool,
+ uint32_t firstQuery,
+ uint32_t queryCount,
+ VkBuffer destBuffer,
+ VkDeviceSize destOffset,
+ VkDeviceSize destStride,
+ VkQueryResultFlags flags)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+ ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ struct mi_value result;
+
+ /* If render target writes are ongoing, request a render target cache flush
+ * to ensure proper ordering of the commands from the 3d pipe and the
+ * command streamer.
+ */ + if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_TILE_CACHE_FLUSH_BIT | + ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, + "CopyQueryPoolResults"); + } + + if ((flags & VK_QUERY_RESULT_WAIT_BIT) || + (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) || + /* Occlusion & timestamp queries are written using a PIPE_CONTROL and + * because we're about to copy values from MI commands, we need to + * stall the command streamer to make sure the PIPE_CONTROL values have + * landed, otherwise we could see inconsistent values & availability. + * + * From the vulkan spec: + * + * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of + * previous uses of vkCmdResetQueryPool in the same queue, without + * any additional synchronization." + */ + pool->type == VK_QUERY_TYPE_OCCLUSION || + pool->type == VK_QUERY_TYPE_TIMESTAMP) { + anv_add_pending_pipe_bits(cmd_buffer, + ANV_PIPE_CS_STALL_BIT, + "CopyQueryPoolResults"); + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + + struct anv_address dest_addr = anv_address_add(buffer->address, destOffset); + for (uint32_t i = 0; i < queryCount; i++) { + struct anv_address query_addr = anv_query_address(pool, firstQuery + i); + uint32_t idx = 0; + switch (pool->type) { + case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: + result = compute_query_result(&b, anv_address_add(query_addr, 8)); + /* Like in the case of vkGetQueryPoolResults, if the query is + * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set, + * conservatively write 0 as the query result. If the + * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value. + */ + gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr, + 1 /* available */, flags, idx, result); + if (flags & VK_QUERY_RESULT_PARTIAL_BIT) { + gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr, + 0 /* unavailable */, flags, idx, mi_imm(0)); + } + idx++; + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + uint32_t statistics = pool->pipeline_statistics; + while (statistics) { + uint32_t stat = u_bit_scan(&statistics); + + result = compute_query_result(&b, anv_address_add(query_addr, + idx * 16 + 8)); + + /* WaDividePSInvocationCountBy4:HSW,BDW */ + if ((cmd_buffer->device->info->ver == 8 || + cmd_buffer->device->info->verx10 == 75) && + (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) { + result = mi_ushr32_imm(&b, result, 2); + } + + gpu_write_query_result(&b, dest_addr, flags, idx++, result); + } + assert(idx == util_bitcount(pool->pipeline_statistics)); + break; + } + + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + result = compute_query_result(&b, anv_address_add(query_addr, 8)); + gpu_write_query_result(&b, dest_addr, flags, idx++, result); + result = compute_query_result(&b, anv_address_add(query_addr, 24)); + gpu_write_query_result(&b, dest_addr, flags, idx++, result); + break; + + case VK_QUERY_TYPE_TIMESTAMP: + result = mi_mem64(anv_address_add(query_addr, 8)); + gpu_write_query_result(&b, dest_addr, flags, idx++, result); + break; + +#if GFX_VER >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + unreachable("Copy KHR performance query results not implemented"); + break; +#endif + + default: + unreachable("unhandled query type"); + } + + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { + gpu_write_query_result(&b, dest_addr, flags, idx, + mi_mem64(query_addr)); + } + + dest_addr = anv_address_add(dest_addr, destStride); 
+ } +} + +#else +void genX(CmdCopyQueryPoolResults)( + VkCommandBuffer commandBuffer, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount, + VkBuffer destBuffer, + VkDeviceSize destOffset, + VkDeviceSize destStride, + VkQueryResultFlags flags) +{ + anv_finishme("Queries not yet supported on Ivy Bridge"); +} +#endif diff --git a/src/intel/vulkan_hasvk/genX_state.c b/src/intel/vulkan_hasvk/genX_state.c new file mode 100644 index 00000000000..b568960907d --- /dev/null +++ b/src/intel/vulkan_hasvk/genX_state.c @@ -0,0 +1,1141 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "anv_private.h" + +#include "common/intel_aux_map.h" +#include "common/intel_sample_positions.h" +#include "common/intel_pixel_hash.h" +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" + +#include "vk_standard_sample_locations.h" +#include "vk_util.h" + +static void +genX(emit_slice_hashing_state)(struct anv_device *device, + struct anv_batch *batch) +{ +#if GFX_VER == 11 + /* Gfx11 hardware has two pixel pipes at most. */ + for (unsigned i = 2; i < ARRAY_SIZE(device->info->ppipe_subslices); i++) + assert(device->info->ppipe_subslices[i] == 0); + + if (device->info->ppipe_subslices[0] == device->info->ppipe_subslices[1]) + return; + + if (!device->slice_hash.alloc_size) { + unsigned size = GENX(SLICE_HASH_TABLE_length) * 4; + device->slice_hash = + anv_state_pool_alloc(&device->dynamic_state_pool, size, 64); + + const bool flip = device->info->ppipe_subslices[0] < + device->info->ppipe_subslices[1]; + struct GENX(SLICE_HASH_TABLE) table; + intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]); + + GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table); + } + + anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) { + ptr.SliceHashStatePointerValid = true; + ptr.SliceHashTableStatePointer = device->slice_hash.offset; + } + + anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) { + mode.SliceHashingTableEnable = true; + } +#elif GFX_VERx10 == 120 + /* For each n calculate ppipes_of[n], equal to the number of pixel pipes + * present with n active dual subslices. 
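+ * (For example, a part where all three pixel pipes expose two active dual
+ * subslices yields ppipes_of = { 0, 0, 3 }, i.e. the fully symmetric case
+ * handled by the early return below.)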
+ */ + unsigned ppipes_of[3] = {}; + + for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) { + for (unsigned p = 0; p < 3; p++) + ppipes_of[n] += (device->info->ppipe_subslices[p] == n); + } + + /* Gfx12 has three pixel pipes. */ + for (unsigned p = 3; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) + assert(device->info->ppipe_subslices[p] == 0); + + if (ppipes_of[2] == 3 || ppipes_of[0] == 2) { + /* All three pixel pipes have the maximum number of active dual + * subslices, or there is only one active pixel pipe: Nothing to do. + */ + return; + } + + anv_batch_emit(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) { + p.SliceHashControl[0] = TABLE_0; + + if (ppipes_of[2] == 2 && ppipes_of[0] == 1) + intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]); + else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1) + intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]); + + if (ppipes_of[2] == 2 && ppipes_of[1] == 1) + intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]); + else if (ppipes_of[2] == 2 && ppipes_of[0] == 1) + intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]); + else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1) + intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]); + else + unreachable("Illegal fusing."); + } + + anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) { + p.SubsliceHashingTableEnable = true; + p.SubsliceHashingTableEnableMask = true; + } +#elif GFX_VERx10 == 125 + uint32_t ppipe_mask = 0; + for (unsigned p = 0; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) { + if (device->info->ppipe_subslices[p]) + ppipe_mask |= (1u << p); + } + assert(ppipe_mask); + + if (!device->slice_hash.alloc_size) { + unsigned size = GENX(SLICE_HASH_TABLE_length) * 4; + device->slice_hash = + anv_state_pool_alloc(&device->dynamic_state_pool, size, 64); + + struct GENX(SLICE_HASH_TABLE) table; + + /* Note that the hardware expects an array with 7 tables, each + * table is intended to specify the pixel pipe hashing behavior + * for every possible slice count between 2 and 8, however that + * doesn't actually work, among other reasons due to hardware + * bugs that will cause the GPU to erroneously access the table + * at the wrong index in some cases, so in practice all 7 tables + * need to be initialized to the same value. + */ + for (unsigned i = 0; i < 7; i++) + intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask, table.Entry[i][0]); + + GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table); + } + + anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) { + ptr.SliceHashStatePointerValid = true; + ptr.SliceHashTableStatePointer = device->slice_hash.offset; + } + + anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) { + mode.SliceHashingTableEnable = true; + mode.SliceHashingTableEnableMask = true; + mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask) > 1 ? + hashing32x32 : NormalMode); + mode.CrossSliceHashingModeMask = -1; + } +#endif +} + +static void +init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch) +{ + UNUSED struct anv_device *device = queue->device; + +#if GFX_VER >= 11 + /* Starting with GFX version 11, SLM is no longer part of the L3$ config + * so it never changes throughout the lifetime of the VkDevice. 
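+ * (It is therefore programmed once per queue here and the chosen config is
+ * cached in device->l3_config.)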
+ */ + const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info); + genX(emit_l3_config)(batch, device, cfg); + device->l3_config = cfg; +#endif + +#if GFX_VERx10 >= 125 + /* GEN:BUG:1607854226: + * + * Non-pipelined state has issues with not applying in MEDIA/GPGPU mode. + * Fortunately, we always start the context off in 3D mode. + */ + uint32_t mocs = device->isl_dev.mocs.internal; + anv_batch_emit(batch, GENX(STATE_BASE_ADDRESS), sba) { + sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 }; + sba.GeneralStateBufferSize = 0xfffff; + sba.GeneralStateMOCS = mocs; + sba.GeneralStateBaseAddressModifyEnable = true; + sba.GeneralStateBufferSizeModifyEnable = true; + + sba.StatelessDataPortAccessMOCS = mocs; + + sba.SurfaceStateBaseAddress = + (struct anv_address) { .offset = SURFACE_STATE_POOL_MIN_ADDRESS }; + sba.SurfaceStateMOCS = mocs; + sba.SurfaceStateBaseAddressModifyEnable = true; + + sba.DynamicStateBaseAddress = + (struct anv_address) { .offset = DYNAMIC_STATE_POOL_MIN_ADDRESS }; + sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096; + sba.DynamicStateMOCS = mocs; + sba.DynamicStateBaseAddressModifyEnable = true; + sba.DynamicStateBufferSizeModifyEnable = true; + + sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 }; + sba.IndirectObjectBufferSize = 0xfffff; + sba.IndirectObjectMOCS = mocs; + sba.IndirectObjectBaseAddressModifyEnable = true; + sba.IndirectObjectBufferSizeModifyEnable = true; + + sba.InstructionBaseAddress = + (struct anv_address) { .offset = INSTRUCTION_STATE_POOL_MIN_ADDRESS }; + sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096; + sba.InstructionMOCS = mocs; + sba.InstructionBaseAddressModifyEnable = true; + sba.InstructionBuffersizeModifyEnable = true; + + sba.BindlessSurfaceStateBaseAddress = + (struct anv_address) { .offset = SURFACE_STATE_POOL_MIN_ADDRESS }; + sba.BindlessSurfaceStateSize = (1 << 20) - 1; + sba.BindlessSurfaceStateMOCS = mocs; + sba.BindlessSurfaceStateBaseAddressModifyEnable = true; + + sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 }; + sba.BindlessSamplerStateMOCS = mocs; + sba.BindlessSamplerStateBaseAddressModifyEnable = true; + sba.BindlessSamplerStateBufferSize = 0; + + sba.L1CacheControl = L1CC_WB; + } +#endif +} + +static VkResult +init_render_queue_state(struct anv_queue *queue) +{ + struct anv_device *device = queue->device; + uint32_t cmds[128]; + struct anv_batch batch = { + .start = cmds, + .next = cmds, + .end = (void *) cmds + sizeof(cmds), + }; + + anv_batch_emit(&batch, GENX(PIPELINE_SELECT), ps) { +#if GFX_VER >= 9 + ps.MaskBits = GFX_VER >= 12 ? 
0x13 : 3; + ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12; +#endif + ps.PipelineSelection = _3D; + } + +#if GFX_VER == 9 + anv_batch_write_reg(&batch, GENX(CACHE_MODE_1), cm1) { + cm1.FloatBlendOptimizationEnable = true; + cm1.FloatBlendOptimizationEnableMask = true; + cm1.MSCRAWHazardAvoidanceBit = true; + cm1.MSCRAWHazardAvoidanceBitMask = true; + cm1.PartialResolveDisableInVC = true; + cm1.PartialResolveDisableInVCMask = true; + } +#endif + + anv_batch_emit(&batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa); + + anv_batch_emit(&batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) { + rect.ClippedDrawingRectangleYMin = 0; + rect.ClippedDrawingRectangleXMin = 0; + rect.ClippedDrawingRectangleYMax = UINT16_MAX; + rect.ClippedDrawingRectangleXMax = UINT16_MAX; + rect.DrawingRectangleOriginY = 0; + rect.DrawingRectangleOriginX = 0; + } + +#if GFX_VER >= 8 + anv_batch_emit(&batch, GENX(3DSTATE_WM_CHROMAKEY), ck); + + genX(emit_sample_pattern)(&batch, NULL); + + /* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the + * section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer + * Clear." It mentions that the packet overrides GPU state for the clear + * operation and needs to be reset to 0s to clear the overrides. Depending + * on the kernel, we may not get a context with the state for this packet + * zeroed. Do it ourselves just in case. We've observed this to prevent a + * number of GPU hangs on ICL. + */ + anv_batch_emit(&batch, GENX(3DSTATE_WM_HZ_OP), hzp); +#endif + +#if GFX_VER == 11 + /* The default behavior of bit 5 "Headerless Message for Pre-emptable + * Contexts" in SAMPLER MODE register is set to 0, which means + * headerless sampler messages are not allowed for pre-emptable + * contexts. Set the bit 5 to 1 to allow them. + */ + anv_batch_write_reg(&batch, GENX(SAMPLER_MODE), sm) { + sm.HeaderlessMessageforPreemptableContexts = true; + sm.HeaderlessMessageforPreemptableContextsMask = true; + } + + /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in + * HALF_SLICE_CHICKEN7 register. + */ + anv_batch_write_reg(&batch, GENX(HALF_SLICE_CHICKEN7), hsc7) { + hsc7.EnabledTexelOffsetPrecisionFix = true; + hsc7.EnabledTexelOffsetPrecisionFixMask = true; + } + + anv_batch_write_reg(&batch, GENX(TCCNTLREG), tcc) { + tcc.L3DataPartialWriteMergingEnable = true; + tcc.ColorZPartialWriteMergingEnable = true; + tcc.URBPartialWriteMergingEnable = true; + tcc.TCDisable = true; + } +#endif + genX(emit_slice_hashing_state)(device, &batch); + +#if GFX_VER >= 11 + /* hardware specification recommends disabling repacking for + * the compatibility with decompression mechanism in display controller. + */ + if (device->info->disable_ccs_repack) { + anv_batch_write_reg(&batch, GENX(CACHE_MODE_0), cm0) { + cm0.DisableRepackingforCompression = true; + cm0.DisableRepackingforCompressionMask = true; + } + } + + /* an unknown issue is causing vs push constants to become + * corrupted during object-level preemption. For now, restrict + * to command buffer level preemption to avoid rendering + * corruption. + */ + anv_batch_write_reg(&batch, GENX(CS_CHICKEN1), cc1) { + cc1.ReplayMode = MidcmdbufferPreemption; + cc1.ReplayModeMask = true; + +#if GFX_VERx10 == 120 + cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = true; + cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true; +#endif + } + +#if GFX_VERx10 == 120 + /* Wa_1806527549 says to disable the following HiZ optimization when the + * depth buffer is D16_UNORM. 
We've found the WA to help with more depth + * buffer configurations however, so we always disable it just to be safe. + */ + anv_batch_write_reg(&batch, GENX(HIZ_CHICKEN), reg) { + reg.HZDepthTestLEGEOptimizationDisable = true; + reg.HZDepthTestLEGEOptimizationDisableMask = true; + } +#endif + +#if GFX_VERx10 < 125 +#define AA_LINE_QUALITY_REG GENX(3D_CHICKEN3) +#else +#define AA_LINE_QUALITY_REG GENX(CHICKEN_RASTER_1) +#endif + + /* Enable the new line drawing algorithm that produces higher quality + * lines. + */ + anv_batch_write_reg(&batch, AA_LINE_QUALITY_REG, c3) { + c3.AALineQualityFix = true; + c3.AALineQualityFixMask = true; + } +#endif + +#if GFX_VER == 12 + if (device->info->has_aux_map) { + uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx); + assert(aux_base_addr % (32 * 1024) == 0); + anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num); + lri.DataDWord = aux_base_addr & 0xffffffff; + } + anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4; + lri.DataDWord = aux_base_addr >> 32; + } + } +#endif + + /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so + * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address. + * + * This is only safe on kernels with context isolation support. + */ + if (GFX_VER >= 8 && device->physical->has_context_isolation) { +#if GFX_VER >= 9 + anv_batch_write_reg(&batch, GENX(CS_DEBUG_MODE2), csdm2) { + csdm2.CONSTANT_BUFFERAddressOffsetDisable = true; + csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true; + } +#elif GFX_VER == 8 + anv_batch_write_reg(&batch, GENX(INSTPM), instpm) { + instpm.CONSTANT_BUFFERAddressOffsetDisable = true; + instpm.CONSTANT_BUFFERAddressOffsetDisableMask = true; + } +#endif + } + + init_common_queue_state(queue, &batch); + + anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); + + assert(batch.next <= batch.end); + + return anv_queue_submit_simple_batch(queue, &batch); +} + +static VkResult +init_compute_queue_state(struct anv_queue *queue) +{ + struct anv_batch batch; + + uint32_t cmds[64]; + batch.start = batch.next = cmds; + batch.end = (void *) cmds + sizeof(cmds); + + anv_batch_emit(&batch, GENX(PIPELINE_SELECT), ps) { +#if GFX_VER >= 9 + ps.MaskBits = 3; +#endif +#if GFX_VER >= 11 + ps.MaskBits |= 0x10; + ps.MediaSamplerDOPClockGateEnable = true; +#endif + ps.PipelineSelection = GPGPU; + } + + init_common_queue_state(queue, &batch); + + anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); + + assert(batch.next <= batch.end); + + return anv_queue_submit_simple_batch(queue, &batch); +} + +void +genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice) +{ + assert(pdevice->info.verx10 == GFX_VERx10); +} + +VkResult +genX(init_device_state)(struct anv_device *device) +{ + VkResult res; + + device->slice_hash = (struct anv_state) { 0 }; + for (uint32_t i = 0; i < device->queue_count; i++) { + struct anv_queue *queue = &device->queues[i]; + switch (queue->family->engine_class) { + case I915_ENGINE_CLASS_RENDER: + res = init_render_queue_state(queue); + break; + case I915_ENGINE_CLASS_COMPUTE: + res = init_compute_queue_state(queue); + break; + default: + res = vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + break; + } + if (res != VK_SUCCESS) + return res; + } + + return res; +} + +#if GFX_VERx10 >= 125 +#define maybe_for_each_shading_rate_op(name) \ + for (VkFragmentShadingRateCombinerOpKHR name = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR; 
\ + name <= VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR; \ + name++) +#elif GFX_VER >= 12 +#define maybe_for_each_shading_rate_op(name) +#endif + +/* Rather than reemitting the CPS_STATE structure everything those changes and + * for as many viewports as needed, we can just prepare all possible cases and + * just pick the right offset from the prepacked states when needed. + */ +void +genX(init_cps_device_state)(struct anv_device *device) +{ +#if GFX_VER >= 12 + void *cps_state_ptr = device->cps_states.map; + + /* Disabled CPS mode */ + for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) { + struct GENX(CPS_STATE) cps_state = { + .CoarsePixelShadingMode = CPS_MODE_CONSTANT, + .MinCPSizeX = 1, + .MinCPSizeY = 1, +#if GFX_VERx10 >= 125 + .Combiner0OpcodeforCPsize = PASSTHROUGH, + .Combiner1OpcodeforCPsize = PASSTHROUGH, +#endif /* GFX_VERx10 >= 125 */ + + }; + + GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state); + cps_state_ptr += GENX(CPS_STATE_length) * 4; + } + + maybe_for_each_shading_rate_op(op0) { + maybe_for_each_shading_rate_op(op1) { + for (uint32_t x = 1; x <= 4; x *= 2) { + for (uint32_t y = 1; y <= 4; y *= 2) { + struct GENX(CPS_STATE) cps_state = { + .CoarsePixelShadingMode = CPS_MODE_CONSTANT, + .MinCPSizeX = x, + .MinCPSizeY = y, + }; + +#if GFX_VERx10 >= 125 + static const uint32_t combiner_ops[] = { + [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR] = PASSTHROUGH, + [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = OVERRIDE, + [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR] = HIGH_QUALITY, + [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR] = LOW_QUALITY, + [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR] = RELATIVE, + }; + + cps_state.Combiner0OpcodeforCPsize = combiner_ops[op0]; + cps_state.Combiner1OpcodeforCPsize = combiner_ops[op1]; +#endif /* GFX_VERx10 >= 125 */ + + for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) { + GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state); + cps_state_ptr += GENX(CPS_STATE_length) * 4; + } + } + } + } + } +#endif /* GFX_VER >= 12 */ +} + +#if GFX_VER >= 12 +static uint32_t +get_cps_state_offset(struct anv_device *device, bool cps_enabled, + const struct vk_fragment_shading_rate_state *fsr) +{ + if (!cps_enabled) + return device->cps_states.offset; + + uint32_t offset; + static const uint32_t size_index[] = { + [1] = 0, + [2] = 1, + [4] = 2, + }; + +#if GFX_VERx10 >= 125 + offset = + 1 + /* skip disabled */ + fsr->combiner_ops[0] * 5 * 3 * 3 + + fsr->combiner_ops[1] * 3 * 3 + + size_index[fsr->fragment_size.width] * 3 + + size_index[fsr->fragment_size.height]; +#else + offset = + 1 + /* skip disabled */ + size_index[fsr->fragment_size.width] * 3 + + size_index[fsr->fragment_size.height]; +#endif + + offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4; + + return device->cps_states.offset + offset; +} +#endif /* GFX_VER >= 12 */ + +void +genX(emit_l3_config)(struct anv_batch *batch, + const struct anv_device *device, + const struct intel_l3_config *cfg) +{ + UNUSED const struct intel_device_info *devinfo = device->info; + +#if GFX_VER >= 8 + +#if GFX_VER >= 12 +#define L3_ALLOCATION_REG GENX(L3ALLOC) +#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num) +#else +#define L3_ALLOCATION_REG GENX(L3CNTLREG) +#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num) +#endif + + anv_batch_write_reg(batch, L3_ALLOCATION_REG, l3cr) { + if (cfg == NULL) { +#if GFX_VER >= 12 + l3cr.L3FullWayAllocationEnable = true; +#else + unreachable("Invalid L3$ config"); +#endif + } else { +#if GFX_VER < 11 + l3cr.SLMEnable = cfg->n[INTEL_L3P_SLM]; +#endif +#if GFX_VER 
== 11 + /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be + * set in L3CNTLREG register. The default setting of the bit is not + * the desirable behavior. + */ + l3cr.ErrorDetectionBehaviorControl = true; + l3cr.UseFullWays = true; +#endif /* GFX_VER == 11 */ + assert(cfg->n[INTEL_L3P_IS] == 0); + assert(cfg->n[INTEL_L3P_C] == 0); + assert(cfg->n[INTEL_L3P_T] == 0); + l3cr.URBAllocation = cfg->n[INTEL_L3P_URB]; + l3cr.ROAllocation = cfg->n[INTEL_L3P_RO]; + l3cr.DCAllocation = cfg->n[INTEL_L3P_DC]; + l3cr.AllAllocation = cfg->n[INTEL_L3P_ALL]; + } + } + +#else /* GFX_VER < 8 */ + + const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL]; + const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] || + cfg->n[INTEL_L3P_ALL]; + const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] || + cfg->n[INTEL_L3P_ALL]; + const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] || + cfg->n[INTEL_L3P_ALL]; + + assert(!cfg->n[INTEL_L3P_ALL]); + + /* When enabled SLM only uses a portion of the L3 on half of the banks, + * the matching space on the remaining banks has to be allocated to a + * client (URB for all validated configurations) set to the + * lower-bandwidth 2-bank address hashing mode. + */ + const bool urb_low_bw = cfg->n[INTEL_L3P_SLM] && devinfo->platform != INTEL_PLATFORM_BYT; + assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]); + + /* Minimum number of ways that can be allocated to the URB. */ + const unsigned n0_urb = devinfo->platform == INTEL_PLATFORM_BYT ? 32 : 0; + assert(cfg->n[INTEL_L3P_URB] >= n0_urb); + + anv_batch_write_reg(batch, GENX(L3SQCREG1), l3sqc) { + l3sqc.ConvertDC_UC = !has_dc; + l3sqc.ConvertIS_UC = !has_is; + l3sqc.ConvertC_UC = !has_c; + l3sqc.ConvertT_UC = !has_t; +#if GFX_VERx10 == 75 + l3sqc.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT; +#else + l3sqc.L3SQGeneralPriorityCreditInitialization = + devinfo->platform == INTEL_PLATFORM_BYT ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT; +#endif + l3sqc.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT; + } + + anv_batch_write_reg(batch, GENX(L3CNTLREG2), l3cr2) { + l3cr2.SLMEnable = cfg->n[INTEL_L3P_SLM]; + l3cr2.URBLowBandwidth = urb_low_bw; + l3cr2.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb; +#if !GFX_VERx10 == 75 + l3cr2.ALLAllocation = cfg->n[INTEL_L3P_ALL]; +#endif + l3cr2.ROAllocation = cfg->n[INTEL_L3P_RO]; + l3cr2.DCAllocation = cfg->n[INTEL_L3P_DC]; + } + + anv_batch_write_reg(batch, GENX(L3CNTLREG3), l3cr3) { + l3cr3.ISAllocation = cfg->n[INTEL_L3P_IS]; + l3cr3.ISLowBandwidth = 0; + l3cr3.CAllocation = cfg->n[INTEL_L3P_C]; + l3cr3.CLowBandwidth = 0; + l3cr3.TAllocation = cfg->n[INTEL_L3P_T]; + l3cr3.TLowBandwidth = 0; + } + +#if GFX_VERx10 == 75 + if (device->physical->cmd_parser_version >= 4) { + /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep + * them disabled to avoid crashing the system hard. 
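+ * (The disable bit is mirrored in both SCRATCH1 and CHICKEN3, so the two
+ * register writes below are kept in sync.)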
+ */ + anv_batch_write_reg(batch, GENX(SCRATCH1), s1) { + s1.L3AtomicDisable = !has_dc; + } + anv_batch_write_reg(batch, GENX(CHICKEN3), c3) { + c3.L3AtomicDisableMask = true; + c3.L3AtomicDisable = !has_dc; + } + } +#endif /* GFX_VERx10 == 75 */ + +#endif /* GFX_VER < 8 */ +} + +void +genX(emit_multisample)(struct anv_batch *batch, uint32_t samples, + const struct vk_sample_locations_state *sl) +{ + if (sl != NULL) { + assert(sl->per_pixel == samples); + assert(sl->grid_size.width == 1); + assert(sl->grid_size.height == 1); + } else { + sl = vk_standard_sample_locations_state(samples); + } + + anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) { + ms.NumberofMultisamples = __builtin_ffs(samples) - 1; + + ms.PixelLocation = CENTER; +#if GFX_VER >= 8 + /* The PRM says that this bit is valid only for DX9: + * + * SW can choose to set this bit only for DX9 API. DX10/OGL API's + * should not have any effect by setting or not setting this bit. + */ + ms.PixelPositionOffsetEnable = false; +#else + switch (samples) { + case 1: + INTEL_SAMPLE_POS_1X_ARRAY(ms.Sample, sl->locations); + break; + case 2: + INTEL_SAMPLE_POS_2X_ARRAY(ms.Sample, sl->locations); + break; + case 4: + INTEL_SAMPLE_POS_4X_ARRAY(ms.Sample, sl->locations); + break; + case 8: + INTEL_SAMPLE_POS_8X_ARRAY(ms.Sample, sl->locations); + break; + default: + break; + } +#endif + } +} + +#if GFX_VER >= 8 +void +genX(emit_sample_pattern)(struct anv_batch *batch, + const struct vk_sample_locations_state *sl) +{ + assert(sl == NULL || sl->grid_size.width == 1); + assert(sl == NULL || sl->grid_size.height == 1); + + /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and + * VkPhysicalDeviceFeatures::standardSampleLocations. + */ + anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_PATTERN), sp) { + /* The Skylake PRM Vol. 2a "3DSTATE_SAMPLE_PATTERN" says: + * + * "When programming the sample offsets (for NUMSAMPLES_4 or _8 + * and MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3 + * (or 7 for 8X, or 15 for 16X) must have monotonically increasing + * distance from the pixel center. This is required to get the + * correct centroid computation in the device." + * + * However, the Vulkan spec seems to require that the the samples occur + * in the order provided through the API. The standard sample patterns + * have the above property that they have monotonically increasing + * distances from the center but client-provided ones do not. As long as + * this only affects centroid calculations as the docs say, we should be + * ok because OpenGL and Vulkan only require that the centroid be some + * lit sample and that it's the same for all samples in a pixel; they + * have no requirement that it be the one closest to center. + */ + for (uint32_t i = 1; i <= (GFX_VER >= 9 ? 
16 : 8); i *= 2) { + switch (i) { + case VK_SAMPLE_COUNT_1_BIT: + if (sl && sl->per_pixel == i) { + INTEL_SAMPLE_POS_1X_ARRAY(sp._1xSample, sl->locations); + } else { + INTEL_SAMPLE_POS_1X(sp._1xSample); + } + break; + case VK_SAMPLE_COUNT_2_BIT: + if (sl && sl->per_pixel == i) { + INTEL_SAMPLE_POS_2X_ARRAY(sp._2xSample, sl->locations); + } else { + INTEL_SAMPLE_POS_2X(sp._2xSample); + } + break; + case VK_SAMPLE_COUNT_4_BIT: + if (sl && sl->per_pixel == i) { + INTEL_SAMPLE_POS_4X_ARRAY(sp._4xSample, sl->locations); + } else { + INTEL_SAMPLE_POS_4X(sp._4xSample); + } + break; + case VK_SAMPLE_COUNT_8_BIT: + if (sl && sl->per_pixel == i) { + INTEL_SAMPLE_POS_8X_ARRAY(sp._8xSample, sl->locations); + } else { + INTEL_SAMPLE_POS_8X(sp._8xSample); + } + break; +#if GFX_VER >= 9 + case VK_SAMPLE_COUNT_16_BIT: + if (sl && sl->per_pixel == i) { + INTEL_SAMPLE_POS_16X_ARRAY(sp._16xSample, sl->locations); + } else { + INTEL_SAMPLE_POS_16X(sp._16xSample); + } + break; +#endif + default: + unreachable("Invalid sample count"); + } + } + } +} +#endif + +#if GFX_VER >= 11 +void +genX(emit_shading_rate)(struct anv_batch *batch, + const struct anv_graphics_pipeline *pipeline, + const struct vk_fragment_shading_rate_state *fsr) +{ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + const bool cps_enable = wm_prog_data && wm_prog_data->per_coarse_pixel_dispatch; + +#if GFX_VER == 11 + anv_batch_emit(batch, GENX(3DSTATE_CPS), cps) { + cps.CoarsePixelShadingMode = cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE; + if (cps_enable) { + cps.MinCPSizeX = fsr->fragment_size.width; + cps.MinCPSizeY = fsr->fragment_size.height; + } + } +#elif GFX_VER >= 12 + /* TODO: we can optimize this flush in the following cases: + * + * In the case where the last geometry shader emits a value that is not + * constant, we can avoid this stall because we can synchronize the + * pixel shader internally with + * 3DSTATE_PS::EnablePSDependencyOnCPsizeChange. + * + * If we know that the previous pipeline and the current one are using + * the same fragment shading rate. + */ + anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { +#if GFX_VERx10 >= 125 + pc.PSSStallSyncEnable = true; +#else + pc.PSDSyncEnable = true; +#endif + } + + anv_batch_emit(batch, GENX(3DSTATE_CPS_POINTERS), cps) { + struct anv_device *device = pipeline->base.device; + + cps.CoarsePixelShadingStateArrayPointer = + get_cps_state_offset(device, cps_enable, fsr); + } +#endif +} +#endif /* GFX_VER >= 11 */ + +static uint32_t +vk_to_intel_tex_filter(VkFilter filter, bool anisotropyEnable) +{ + switch (filter) { + default: + unreachable("Invalid filter"); + case VK_FILTER_NEAREST: + return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_NEAREST; + case VK_FILTER_LINEAR: + return anisotropyEnable ? 
MAPFILTER_ANISOTROPIC : MAPFILTER_LINEAR;
+ }
+}
+
+static uint32_t
+vk_to_intel_max_anisotropy(float ratio)
+{
+ return (anv_clamp_f(ratio, 2, 16) - 2) / 2;
+}
+
+static const uint32_t vk_to_intel_mipmap_mode[] = {
+ [VK_SAMPLER_MIPMAP_MODE_NEAREST] = MIPFILTER_NEAREST,
+ [VK_SAMPLER_MIPMAP_MODE_LINEAR] = MIPFILTER_LINEAR
+};
+
+static const uint32_t vk_to_intel_tex_address[] = {
+ [VK_SAMPLER_ADDRESS_MODE_REPEAT] = TCM_WRAP,
+ [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = TCM_MIRROR,
+ [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE] = TCM_CLAMP,
+ [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
+ [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
+};
+
+/* Vulkan specifies the result of shadow comparisons as:
+ * 1 if ref <op> texel,
+ * 0 otherwise.
+ *
+ * The hardware does:
+ * 0 if texel <op> ref,
+ * 1 otherwise.
+ *
+ * So, these look a bit strange because there's both a negation
+ * and swapping of the arguments involved.
+ */
+static const uint32_t vk_to_intel_shadow_compare_op[] = {
+ [VK_COMPARE_OP_NEVER] = PREFILTEROP_ALWAYS,
+ [VK_COMPARE_OP_LESS] = PREFILTEROP_LEQUAL,
+ [VK_COMPARE_OP_EQUAL] = PREFILTEROP_NOTEQUAL,
+ [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROP_LESS,
+ [VK_COMPARE_OP_GREATER] = PREFILTEROP_GEQUAL,
+ [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROP_EQUAL,
+ [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GREATER,
+ [VK_COMPARE_OP_ALWAYS] = PREFILTEROP_NEVER,
+};
+
+#if GFX_VER >= 9
+static const uint32_t vk_to_intel_sampler_reduction_mode[] = {
+ [VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE] = STD_FILTER,
+ [VK_SAMPLER_REDUCTION_MODE_MIN] = MINIMUM,
+ [VK_SAMPLER_REDUCTION_MODE_MAX] = MAXIMUM,
+};
+#endif
+
+VkResult genX(CreateSampler)(
+ VkDevice _device,
+ const VkSamplerCreateInfo* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkSampler* pSampler)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ struct anv_sampler *sampler;
+
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO);
+
+ sampler = vk_object_zalloc(&device->vk, pAllocator, sizeof(*sampler),
+ VK_OBJECT_TYPE_SAMPLER);
+ if (!sampler)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ sampler->n_planes = 1;
+
+ uint32_t border_color_stride = GFX_VERx10 == 75 ? 512 : 64;
+ uint32_t border_color_offset;
+ ASSERTED bool has_custom_color = false;
+ if (pCreateInfo->borderColor <= VK_BORDER_COLOR_INT_OPAQUE_WHITE) {
+ border_color_offset = device->border_colors.offset +
+ pCreateInfo->borderColor *
+ border_color_stride;
+ } else {
+ assert(GFX_VER >= 8);
+ sampler->custom_border_color =
+ anv_state_reserved_pool_alloc(&device->custom_border_colors);
+ border_color_offset = sampler->custom_border_color.offset;
+ }
+
+#if GFX_VER >= 9
+ unsigned sampler_reduction_mode = STD_FILTER;
+ bool enable_sampler_reduction = false;
+#endif
+
+ vk_foreach_struct_const(ext, pCreateInfo->pNext) {
+ switch (ext->sType) {
+ case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO: {
+ VkSamplerYcbcrConversionInfo *pSamplerConversion =
+ (VkSamplerYcbcrConversionInfo *) ext;
+ ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion,
+ pSamplerConversion->conversion);
+
+ /* Ignore conversion for non-YUV formats. This fulfills a requirement
+ * for clients that want to utilize the same code path for images with
+ * external formats (VK_FORMAT_UNDEFINED) and "regular" RGBA images
+ * where format is known.
+ */ + if (conversion == NULL || !conversion->format->can_ycbcr) + break; + + sampler->n_planes = conversion->format->n_planes; + sampler->conversion = conversion; + break; + } +#if GFX_VER >= 9 + case VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO: { + VkSamplerReductionModeCreateInfo *sampler_reduction = + (VkSamplerReductionModeCreateInfo *) ext; + sampler_reduction_mode = + vk_to_intel_sampler_reduction_mode[sampler_reduction->reductionMode]; + enable_sampler_reduction = true; + break; + } +#endif + case VK_STRUCTURE_TYPE_SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT: { + VkSamplerCustomBorderColorCreateInfoEXT *custom_border_color = + (VkSamplerCustomBorderColorCreateInfoEXT *) ext; + if (sampler->custom_border_color.map == NULL) + break; + + union isl_color_value color = { .u32 = { + custom_border_color->customBorderColor.uint32[0], + custom_border_color->customBorderColor.uint32[1], + custom_border_color->customBorderColor.uint32[2], + custom_border_color->customBorderColor.uint32[3], + } }; + + const struct anv_format *format_desc = + custom_border_color->format != VK_FORMAT_UNDEFINED ? + anv_get_format(custom_border_color->format) : NULL; + + /* For formats with a swizzle, it does not carry over to the sampler + * for border colors, so we need to do the swizzle ourselves here. + */ + if (format_desc && format_desc->n_planes == 1 && + !isl_swizzle_is_identity(format_desc->planes[0].swizzle)) { + const struct anv_format_plane *fmt_plane = &format_desc->planes[0]; + + assert(!isl_format_has_int_channel(fmt_plane->isl_format)); + color = isl_color_value_swizzle(color, fmt_plane->swizzle, true); + } + + memcpy(sampler->custom_border_color.map, color.u32, sizeof(color)); + has_custom_color = true; + break; + } + case VK_STRUCTURE_TYPE_SAMPLER_BORDER_COLOR_COMPONENT_MAPPING_CREATE_INFO_EXT: + break; + default: + anv_debug_ignored_stype(ext->sType); + break; + } + } + + assert((sampler->custom_border_color.map == NULL) || has_custom_color); + + if (device->physical->has_bindless_samplers) { + /* If we have bindless, allocate enough samplers. We allocate 32 bytes + * for each sampler instead of 16 bytes because we want all bindless + * samplers to be 32-byte aligned so we don't have to use indirect + * sampler messages on them. + */ + sampler->bindless_state = + anv_state_pool_alloc(&device->dynamic_state_pool, + sampler->n_planes * 32, 32); + } + + const bool seamless_cube = + !(pCreateInfo->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT); + + for (unsigned p = 0; p < sampler->n_planes; p++) { + const bool plane_has_chroma = + sampler->conversion && sampler->conversion->format->planes[p].has_chroma; + const VkFilter min_filter = + plane_has_chroma ? sampler->conversion->chroma_filter : pCreateInfo->minFilter; + const VkFilter mag_filter = + plane_has_chroma ? sampler->conversion->chroma_filter : pCreateInfo->magFilter; + const bool enable_min_filter_addr_rounding = min_filter != VK_FILTER_NEAREST; + const bool enable_mag_filter_addr_rounding = mag_filter != VK_FILTER_NEAREST; + /* From Broadwell PRM, SAMPLER_STATE: + * "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces." + */ + const bool isl_format_is_planar_yuv = sampler->conversion && + isl_format_is_yuv(sampler->conversion->format->planes[0].isl_format) && + isl_format_is_planar(sampler->conversion->format->planes[0].isl_format); + + const uint32_t mip_filter_mode = + isl_format_is_planar_yuv ? 
+ MIPFILTER_NONE : vk_to_intel_mipmap_mode[pCreateInfo->mipmapMode]; + + struct GENX(SAMPLER_STATE) sampler_state = { + .SamplerDisable = false, + .TextureBorderColorMode = DX10OGL, + +#if GFX_VER >= 11 + .CPSLODCompensationEnable = true, +#endif + +#if GFX_VER >= 8 + .LODPreClampMode = CLAMP_MODE_OGL, +#else + .LODPreClampEnable = CLAMP_ENABLE_OGL, +#endif + +#if GFX_VER == 8 + .BaseMipLevel = 0.0, +#endif + .MipModeFilter = mip_filter_mode, + .MagModeFilter = vk_to_intel_tex_filter(mag_filter, pCreateInfo->anisotropyEnable), + .MinModeFilter = vk_to_intel_tex_filter(min_filter, pCreateInfo->anisotropyEnable), + .TextureLODBias = anv_clamp_f(pCreateInfo->mipLodBias, -16, 15.996), + .AnisotropicAlgorithm = + pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY, + .MinLOD = anv_clamp_f(pCreateInfo->minLod, 0, 14), + .MaxLOD = anv_clamp_f(pCreateInfo->maxLod, 0, 14), + .ChromaKeyEnable = 0, + .ChromaKeyIndex = 0, + .ChromaKeyMode = 0, + .ShadowFunction = + vk_to_intel_shadow_compare_op[pCreateInfo->compareEnable ? + pCreateInfo->compareOp : VK_COMPARE_OP_NEVER], + .CubeSurfaceControlMode = seamless_cube ? OVERRIDE : PROGRAMMED, + + .BorderColorPointer = border_color_offset, + +#if GFX_VER >= 8 + .LODClampMagnificationMode = MIPNONE, +#endif + + .MaximumAnisotropy = vk_to_intel_max_anisotropy(pCreateInfo->maxAnisotropy), + .RAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding, + .RAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding, + .VAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding, + .VAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding, + .UAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding, + .UAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding, + .TrilinearFilterQuality = 0, + .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates, + .TCXAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeU], + .TCYAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeV], + .TCZAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeW], + +#if GFX_VER >= 9 + .ReductionType = sampler_reduction_mode, + .ReductionTypeEnable = enable_sampler_reduction, +#endif + }; + + GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state); + + if (sampler->bindless_state.map) { + memcpy(sampler->bindless_state.map + p * 32, + sampler->state[p], GENX(SAMPLER_STATE_length) * 4); + } + } + + *pSampler = anv_sampler_to_handle(sampler); + + return VK_SUCCESS; +} diff --git a/src/intel/vulkan_hasvk/gfx7_cmd_buffer.c b/src/intel/vulkan_hasvk/gfx7_cmd_buffer.c new file mode 100644 index 00000000000..55221799f32 --- /dev/null +++ b/src/intel/vulkan_hasvk/gfx7_cmd_buffer.c @@ -0,0 +1,314 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <assert.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> + +#include "anv_private.h" +#include "vk_format.h" + +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" + +static uint32_t +get_depth_format(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx; + + switch (gfx->depth_att.vk_format) { + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_D16_UNORM_S8_UINT: + return D16_UNORM; + + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D24_UNORM_S8_UINT: + return D24_UNORM_X8_UINT; + + case VK_FORMAT_D32_SFLOAT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return D32_FLOAT; + + default: + return D16_UNORM; + } +} + +void +genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_RENDER_TARGETS)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) { + /* Take dynamic primitive topology in to account with + * 3DSTATE_SF::MultisampleRasterizationMode + */ + VkPolygonMode dynamic_raster_mode = + genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline, + dyn->ia.primitive_topology); + uint32_t ms_rast_mode = + genX(ms_rasterization_mode)(pipeline, dynamic_raster_mode); + + bool aa_enable = anv_rasterization_aa_mode(dynamic_raster_mode, + pipeline->line_mode); + + uint32_t sf_dw[GENX(3DSTATE_SF_length)]; + struct GENX(3DSTATE_SF) sf = { + GENX(3DSTATE_SF_header), + .DepthBufferSurfaceFormat = get_depth_format(cmd_buffer), + .LineWidth = dyn->rs.line.width, + .AntialiasingEnable = aa_enable, + .CullMode = genX(vk_to_intel_cullmode)[dyn->rs.cull_mode], + .FrontWinding = genX(vk_to_intel_front_face)[dyn->rs.front_face], + .MultisampleRasterizationMode = ms_rast_mode, + .GlobalDepthOffsetEnableSolid = dyn->rs.depth_bias.enable, + .GlobalDepthOffsetEnableWireframe = dyn->rs.depth_bias.enable, + .GlobalDepthOffsetEnablePoint = dyn->rs.depth_bias.enable, + .GlobalDepthOffsetConstant = dyn->rs.depth_bias.constant, + .GlobalDepthOffsetScale = dyn->rs.depth_bias.slope, + .GlobalDepthOffsetClamp = dyn->rs.depth_bias.clamp, + }; + GENX(3DSTATE_SF_pack)(NULL, sf_dw, &sf); + + anv_batch_emit_merge(&cmd_buffer->batch, sf_dw, pipeline->gfx7.sf); + } + + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) { + struct anv_state cc_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + GENX(COLOR_CALC_STATE_length) * 4, + 64); + struct GENX(COLOR_CALC_STATE) cc = { + .BlendConstantColorRed = dyn->cb.blend_constants[0],
+ .BlendConstantColorGreen = dyn->cb.blend_constants[1], + .BlendConstantColorBlue = dyn->cb.blend_constants[2], + .BlendConstantColorAlpha = dyn->cb.blend_constants[3], + .StencilReferenceValue = dyn->ds.stencil.front.reference & 0xff, + .BackfaceStencilReferenceValue = dyn->ds.stencil.back.reference & 0xff, + }; + GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) { + ccp.ColorCalcStatePointer = cc_state.offset; + } + } + + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) { + ls.LineStipplePattern = dyn->rs.line.stipple.pattern; + ls.LineStippleInverseRepeatCount = + 1.0f / MAX2(1, dyn->rs.line.stipple.factor); + ls.LineStippleRepeatCount = dyn->rs.line.stipple.factor; + } + } + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_RENDER_TARGETS)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK)) { + uint32_t depth_stencil_dw[GENX(DEPTH_STENCIL_STATE_length)]; + + VkImageAspectFlags ds_aspects = 0; + if (cmd_buffer->state.gfx.depth_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if (cmd_buffer->state.gfx.stencil_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + + struct vk_depth_stencil_state opt_ds = dyn->ds; + vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true); + + struct GENX(DEPTH_STENCIL_STATE) depth_stencil = { + .DoubleSidedStencilEnable = true, + + .StencilTestMask = opt_ds.stencil.front.compare_mask & 0xff, + .StencilWriteMask = opt_ds.stencil.front.write_mask & 0xff, + + .BackfaceStencilTestMask = opt_ds.stencil.back.compare_mask & 0xff, + .BackfaceStencilWriteMask = opt_ds.stencil.back.write_mask & 0xff, + + .DepthTestEnable = opt_ds.depth.test_enable, + .DepthBufferWriteEnable = opt_ds.depth.write_enable, + .DepthTestFunction = genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op], + .StencilTestEnable = opt_ds.stencil.test_enable, + .StencilBufferWriteEnable = opt_ds.stencil.write_enable, + .StencilFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail], + .StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass], + .StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail], + .StencilTestFunction = genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare], + .BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail], + .BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass], + .BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail], + .BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare], + }; + GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil); + + struct anv_state ds_state = + anv_cmd_buffer_emit_dynamic(cmd_buffer, depth_stencil_dw, + sizeof(depth_stencil_dw), 64); + + anv_batch_emit(&cmd_buffer->batch, + GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), dsp) { + 
dsp.PointertoDEPTH_STENCIL_STATE = ds_state.offset; + } + } + + if (cmd_buffer->state.gfx.index_buffer && + ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_INDEX_BUFFER)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))) { + struct anv_buffer *buffer = cmd_buffer->state.gfx.index_buffer; + uint32_t offset = cmd_buffer->state.gfx.index_offset; + +#if GFX_VERx10 == 75 + anv_batch_emit(&cmd_buffer->batch, GFX75_3DSTATE_VF, vf) { + vf.IndexedDrawCutIndexEnable = dyn->ia.primitive_restart_enable; + vf.CutIndex = cmd_buffer->state.gfx.restart_index; + } +#endif + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) { +#if GFX_VERx10 != 75 + ib.CutIndexEnable = dyn->ia.primitive_restart_enable; +#endif + ib.IndexFormat = cmd_buffer->state.gfx.index_type; + ib.MOCS = anv_mocs(cmd_buffer->device, + buffer->address.bo, + ISL_SURF_USAGE_INDEX_BUFFER_BIT); + + ib.BufferStartingAddress = anv_address_add(buffer->address, offset); + ib.BufferEndingAddress = anv_address_add(buffer->address, + buffer->vk.size); + } + } + + /* 3DSTATE_WM in the hope we can avoid spawning fragment shaders + * threads or if we have dirty dynamic primitive topology state and + * need to toggle 3DSTATE_WM::MultisampleRasterizationMode dynamically. + */ + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { + VkPolygonMode dynamic_raster_mode = + genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline, + dyn->ia.primitive_topology); + + uint32_t dwords[GENX(3DSTATE_WM_length)]; + struct GENX(3DSTATE_WM) wm = { + GENX(3DSTATE_WM_header), + + .ThreadDispatchEnable = anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) && + (pipeline->force_fragment_thread_dispatch || + !anv_cmd_buffer_all_color_write_masked(cmd_buffer)), + .MultisampleRasterizationMode = + genX(ms_rasterization_mode)(pipeline, + dynamic_raster_mode), + }; + GENX(3DSTATE_WM_pack)(NULL, dwords, &wm); + + anv_batch_emit_merge(&cmd_buffer->batch, dwords, pipeline->gfx7.wm); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS)) { + const uint32_t samples = MAX2(1, cmd_buffer->state.gfx.samples); + const struct vk_sample_locations_state *sl = dyn->ms.sample_locations; + genX(emit_multisample)(&cmd_buffer->batch, samples, + sl->per_pixel == samples ? sl : NULL); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { + const uint8_t color_writes = dyn->cb.color_write_enables; + + /* Blend states of each RT */ + uint32_t blend_dws[GENX(BLEND_STATE_length) + + MAX_RTS * GENX(BLEND_STATE_ENTRY_length)]; + uint32_t *dws = blend_dws; + memset(blend_dws, 0, sizeof(blend_dws)); + + /* Skip this part */ + dws += GENX(BLEND_STATE_length); + + for (uint32_t i = 0; i < MAX_RTS; i++) { + /* Disable anything above the current number of color attachments. 
*/ + bool write_disabled = i >= cmd_buffer->state.gfx.color_att_count || + (color_writes & BITFIELD_BIT(i)) == 0; + struct GENX(BLEND_STATE_ENTRY) entry = { + .WriteDisableAlpha = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_A_BIT) == 0, + .WriteDisableRed = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_R_BIT) == 0, + .WriteDisableGreen = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_G_BIT) == 0, + .WriteDisableBlue = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_B_BIT) == 0, + .LogicOpFunction = genX(vk_to_intel_logic_op)[dyn->cb.logic_op], + }; + GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry); + dws += GENX(BLEND_STATE_ENTRY_length); + } + + uint32_t num_dwords = GENX(BLEND_STATE_length) + + GENX(BLEND_STATE_ENTRY_length) * MAX_RTS; + + struct anv_state blend_states = + anv_cmd_buffer_merge_dynamic(cmd_buffer, blend_dws, + pipeline->gfx7.blend_state, num_dwords, 64); + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) { + bsp.BlendStatePointer = blend_states.offset; + } + } + + /* When we're done, there is no more dirty gfx state. */ + vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state); + cmd_buffer->state.gfx.dirty = 0; +} + +void +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, + bool enable) +{ + /* The NP PMA fix doesn't exist on gfx7 */ +} diff --git a/src/intel/vulkan_hasvk/gfx8_cmd_buffer.c b/src/intel/vulkan_hasvk/gfx8_cmd_buffer.c new file mode 100644 index 00000000000..8972a0c73fd --- /dev/null +++ b/src/intel/vulkan_hasvk/gfx8_cmd_buffer.c @@ -0,0 +1,706 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <assert.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> + +#include "anv_private.h" + +#include "genxml/gen_macros.h" +#include "genxml/genX_pack.h" + +void +genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable) +{ + if (cmd_buffer->state.pma_fix_enabled == enable) + return; + + cmd_buffer->state.pma_fix_enabled = enable; + + /* According to the Broadwell PIPE_CONTROL documentation, software should + * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set + * prior to the LRI. If stencil buffer writes are enabled, then a Render + * Cache Flush is also necessary.
+ * + * The Skylake docs say to use a depth stall rather than a command + * streamer stall. However, the hardware seems to violently disagree. + * A full command streamer stall seems to be needed in both cases. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DepthCacheFlushEnable = true; + pc.CommandStreamerStallEnable = true; + pc.RenderTargetCacheFlushEnable = true; +#if GFX_VER >= 12 + pc.TileCacheFlushEnable = true; + + /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must + * be set with any PIPE_CONTROL with Depth Flush Enable bit set. + */ + pc.DepthStallEnable = true; +#endif + } + +#if GFX_VER == 9 + + uint32_t cache_mode; + anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0), + .STCPMAOptimizationEnable = enable, + .STCPMAOptimizationEnableMask = true); + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(CACHE_MODE_0_num); + lri.DataDWord = cache_mode; + } + +#elif GFX_VER == 8 + + uint32_t cache_mode; + anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1), + .NPPMAFixEnable = enable, + .NPEarlyZFailsDisable = enable, + .NPPMAFixEnableMask = true, + .NPEarlyZFailsDisableMask = true); + anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = GENX(CACHE_MODE_1_num); + lri.DataDWord = cache_mode; + } + +#endif /* GFX_VER == 8 */ + + /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache + * Flush bits is often necessary. We do it regardless because it's easier. + * The render cache flush is also necessary if stencil writes are enabled. + * + * Again, the Skylake docs give a different set of flushes but the BDW + * flushes seem to work just as well. + */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.DepthStallEnable = true; + pc.DepthCacheFlushEnable = true; + pc.RenderTargetCacheFlushEnable = true; +#if GFX_VER >= 12 + pc.TileCacheFlushEnable = true; +#endif + } +} + +UNUSED static bool +want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer, + const struct vk_depth_stencil_state *ds) +{ + assert(GFX_VER == 8); + + /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE: + * + * SW must set this bit in order to enable this fix when following + * expression is TRUE. 
+ * + * 3DSTATE_WM::ForceThreadDispatch != 1 && + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) && + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) && + * (3DSTATE_DEPTH_BUFFER::HIZ Enable) && + * !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) && + * (3DSTATE_PS_EXTRA::PixelShaderValid) && + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) && + * (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) && + * (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) && + * 3DSTATE_WM::ForceKillPix != ForceOff && + * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable && + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) || + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE && + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) || + * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF)) + */ + + /* These are always true: + * 3DSTATE_WM::ForceThreadDispatch != 1 && + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) + */ + + /* We only enable the PMA fix if we know for certain that HiZ is enabled. + * If we don't know whether HiZ is enabled or not, we disable the PMA fix + * and there is no harm. + * + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) && + * 3DSTATE_DEPTH_BUFFER::HIZ Enable + */ + if (!cmd_buffer->state.hiz_enabled) + return false; + + /* 3DSTATE_PS_EXTRA::PixelShaderValid */ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) + return false; + + /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + if (wm_prog_data->early_fragment_tests) + return false; + + /* We never use anv_pipeline for HiZ ops so this is trivially true: + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) + */ + + /* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */ + if (!ds->depth.test_enable) + return false; + + /* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) && + * 3DSTATE_WM::ForceKillPix != ForceOff && + * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable && + * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) || + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE && + * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) || + * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF)) + */ + return (pipeline->kill_pixel && (ds->depth.write_enable || + ds->stencil.write_enable)) || + wm_prog_data->computed_depth_mode != PSCDEPTH_OFF; +} + +UNUSED static bool +want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer, + const struct vk_depth_stencil_state *ds) +{ + if (GFX_VER > 9) + return false; + assert(GFX_VER == 9); + + /* From the Skylake PRM Vol. 
2c CACHE_MODE_1::STC PMA Optimization Enable: + * + * Clearing this bit will force the STC cache to wait for pending + * retirement of pixels at the HZ-read stage and do the STC-test for + * Non-promoted, R-computed and Computed depth modes instead of + * postponing the STC-test to RCPFE. + * + * STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE && + * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable + * + * STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE && + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE) + * + * COMP_STC_EN = STC_TEST_EN && + * 3DSTATE_PS_EXTRA::PixelShaderComputesStencil + * + * SW parses the pipeline states to generate the following logical + * signal indicating if PMA FIX can be enabled. + * + * STC_PMA_OPT = + * 3DSTATE_WM::ForceThreadDispatch != 1 && + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) && + * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL && + * 3DSTATE_DEPTH_BUFFER::HIZ Enable && + * !(3DSTATE_WM::EDSC_Mode == 2) && + * 3DSTATE_PS_EXTRA::PixelShaderValid && + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) && + * (COMP_STC_EN || STC_WRITE_EN) && + * ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_WM::ForceKillPix == ON || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) || + * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)) + */ + + /* These are always true: + * 3DSTATE_WM::ForceThreadDispatch != 1 && + * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) + */ + + /* We only enable the PMA fix if we know for certain that HiZ is enabled. + * If we don't know whether HiZ is enabled or not, we disable the PMA fix + * and there is no harm. 
+ * + * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) && + * 3DSTATE_DEPTH_BUFFER::HIZ Enable + */ + if (!cmd_buffer->state.hiz_enabled) + return false; + + /* We can't possibly know if HiZ is enabled without the depth attachment */ + ASSERTED const struct anv_image_view *d_iview = + cmd_buffer->state.gfx.depth_att.iview; + assert(d_iview && d_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ); + + /* 3DSTATE_PS_EXTRA::PixelShaderValid */ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) + return false; + + /* !(3DSTATE_WM::EDSC_Mode == 2) */ + const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); + if (wm_prog_data->early_fragment_tests) + return false; + + /* We never use anv_pipeline for HiZ ops so this is trivially true: + * !(3DSTATE_WM_HZ_OP::DepthBufferClear || + * 3DSTATE_WM_HZ_OP::DepthBufferResolve || + * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable || + * 3DSTATE_WM_HZ_OP::StencilBufferClear) + */ + + /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE && + * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable + */ + const bool stc_test_en = ds->stencil.test_enable; + + /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE && + * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable && + * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE) + */ + const bool stc_write_en = ds->stencil.write_enable; + + /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */ + const bool comp_stc_en = stc_test_en && wm_prog_data->computed_stencil; + + /* COMP_STC_EN || STC_WRITE_EN */ + if (!(comp_stc_en || stc_write_en)) + return false; + + /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels || + * 3DSTATE_WM::ForceKillPix == ON || + * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget || + * 3DSTATE_PS_BLEND::AlphaToCoverageEnable || + * 3DSTATE_PS_BLEND::AlphaTestEnable || + * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) || + * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF) + */ + return pipeline->kill_pixel || + wm_prog_data->computed_depth_mode != PSCDEPTH_OFF; +} + +void +genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer) +{ + struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + const struct vk_dynamic_graphics_state *dyn = + &cmd_buffer->vk.dynamic_graphics_state; + +#if GFX_VER >= 11 + if (cmd_buffer->device->vk.enabled_extensions.KHR_fragment_shading_rate && + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR)) + genX(emit_shading_rate)(&cmd_buffer->batch, pipeline, &dyn->fsr); +#endif /* GFX_VER >= 11 */ + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) { + uint32_t sf_dw[GENX(3DSTATE_SF_length)]; + struct GENX(3DSTATE_SF) sf = { + GENX(3DSTATE_SF_header), + }; +#if GFX_VER == 8 + if (cmd_buffer->device->info->platform == INTEL_PLATFORM_CHV) { + sf.CHVLineWidth = dyn->rs.line.width; + } else { + sf.LineWidth = dyn->rs.line.width; + } +#else + sf.LineWidth = dyn->rs.line.width, +#endif + GENX(3DSTATE_SF_pack)(NULL, sf_dw, &sf); + anv_batch_emit_merge(&cmd_buffer->batch, sf_dw, pipeline->gfx8.sf); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) || + BITSET_TEST(dyn->dirty, 
MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) { + /* Take dynamic primitive topology in to account with + * 3DSTATE_RASTER::APIMode + * 3DSTATE_RASTER::DXMultisampleRasterizationEnable + * 3DSTATE_RASTER::AntialiasingEnable + */ + uint32_t api_mode = 0; + bool msaa_raster_enable = false; + + VkPolygonMode dynamic_raster_mode = + genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline, + dyn->ia.primitive_topology); + + genX(rasterization_mode)(dynamic_raster_mode, + pipeline->line_mode, dyn->rs.line.width, + &api_mode, &msaa_raster_enable); + + bool aa_enable = anv_rasterization_aa_mode(dynamic_raster_mode, + pipeline->line_mode); + + uint32_t raster_dw[GENX(3DSTATE_RASTER_length)]; + struct GENX(3DSTATE_RASTER) raster = { + GENX(3DSTATE_RASTER_header), + .APIMode = api_mode, + .DXMultisampleRasterizationEnable = msaa_raster_enable, + .AntialiasingEnable = aa_enable, + .CullMode = genX(vk_to_intel_cullmode)[dyn->rs.cull_mode], + .FrontWinding = genX(vk_to_intel_front_face)[dyn->rs.front_face], + .GlobalDepthOffsetEnableSolid = dyn->rs.depth_bias.enable, + .GlobalDepthOffsetEnableWireframe = dyn->rs.depth_bias.enable, + .GlobalDepthOffsetEnablePoint = dyn->rs.depth_bias.enable, + .GlobalDepthOffsetConstant = dyn->rs.depth_bias.constant, + .GlobalDepthOffsetScale = dyn->rs.depth_bias.slope, + .GlobalDepthOffsetClamp = dyn->rs.depth_bias.clamp, + }; + GENX(3DSTATE_RASTER_pack)(NULL, raster_dw, &raster); + anv_batch_emit_merge(&cmd_buffer->batch, raster_dw, + pipeline->gfx8.raster); + } + + /* Stencil reference values moved from COLOR_CALC_STATE in gfx8 to + * 3DSTATE_WM_DEPTH_STENCIL in gfx9. That means the dirty bits gets split + * across different state packets for gfx8 and gfx9. We handle that by + * using a big old #if switch here. + */ +#if GFX_VER == 8 + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) { + struct anv_state cc_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + GENX(COLOR_CALC_STATE_length) * 4, + 64); + struct GENX(COLOR_CALC_STATE) cc = { + .BlendConstantColorRed = dyn->cb.blend_constants[0], + .BlendConstantColorGreen = dyn->cb.blend_constants[1], + .BlendConstantColorBlue = dyn->cb.blend_constants[2], + .BlendConstantColorAlpha = dyn->cb.blend_constants[3], + .StencilReferenceValue = dyn->ds.stencil.front.reference & 0xff, + .BackfaceStencilReferenceValue = dyn->ds.stencil.back.reference & 0xff, + }; + GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) { + ccp.ColorCalcStatePointer = cc_state.offset; + ccp.ColorCalcStatePointerValid = true; + } + } + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_RENDER_TARGETS)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK)) { + VkImageAspectFlags ds_aspects = 0; + if (cmd_buffer->state.gfx.depth_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if (cmd_buffer->state.gfx.stencil_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + + struct vk_depth_stencil_state 
opt_ds = dyn->ds; + vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) { + ds.DoubleSidedStencilEnable = true; + + ds.StencilTestMask = opt_ds.stencil.front.compare_mask & 0xff; + ds.StencilWriteMask = opt_ds.stencil.front.write_mask & 0xff; + + ds.BackfaceStencilTestMask = opt_ds.stencil.back.compare_mask & 0xff; + ds.BackfaceStencilWriteMask = opt_ds.stencil.back.write_mask & 0xff; + + ds.DepthTestEnable = opt_ds.depth.test_enable; + ds.DepthBufferWriteEnable = opt_ds.depth.write_enable; + ds.DepthTestFunction = genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op]; + ds.StencilTestEnable = opt_ds.stencil.test_enable; + ds.StencilBufferWriteEnable = opt_ds.stencil.write_enable; + ds.StencilFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail]; + ds.StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass]; + ds.StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail]; + ds.StencilTestFunction = genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare]; + ds.BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail]; + ds.BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass]; + ds.BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail]; + ds.BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare]; + } + + const bool pma = want_depth_pma_fix(cmd_buffer, &opt_ds); + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, pma); + } +#else + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) { + struct anv_state cc_state = + anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, + GENX(COLOR_CALC_STATE_length) * 4, + 64); + struct GENX(COLOR_CALC_STATE) cc = { + .BlendConstantColorRed = dyn->cb.blend_constants[0], + .BlendConstantColorGreen = dyn->cb.blend_constants[1], + .BlendConstantColorBlue = dyn->cb.blend_constants[2], + .BlendConstantColorAlpha = dyn->cb.blend_constants[3], + }; + GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) { + ccp.ColorCalcStatePointer = cc_state.offset; + ccp.ColorCalcStatePointerValid = true; + } + } + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_RENDER_TARGETS)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) { + VkImageAspectFlags ds_aspects = 0; + if (cmd_buffer->state.gfx.depth_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; + if (cmd_buffer->state.gfx.stencil_att.vk_format != VK_FORMAT_UNDEFINED) + ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; + + struct vk_depth_stencil_state opt_ds = dyn->ds; + vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true); + + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) { + ds.DoubleSidedStencilEnable = true; + + ds.StencilTestMask = opt_ds.stencil.front.compare_mask & 0xff; + 
ds.StencilWriteMask = opt_ds.stencil.front.write_mask & 0xff; + + ds.BackfaceStencilTestMask = opt_ds.stencil.back.compare_mask & 0xff; + ds.BackfaceStencilWriteMask = opt_ds.stencil.back.write_mask & 0xff; + + ds.StencilReferenceValue = opt_ds.stencil.front.reference & 0xff; + ds.BackfaceStencilReferenceValue = opt_ds.stencil.back.reference & 0xff; + + ds.DepthTestEnable = opt_ds.depth.test_enable; + ds.DepthBufferWriteEnable = opt_ds.depth.write_enable; + ds.DepthTestFunction = genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op]; + ds.StencilTestEnable = opt_ds.stencil.test_enable; + ds.StencilBufferWriteEnable = opt_ds.stencil.write_enable; + ds.StencilFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail]; + ds.StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass]; + ds.StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail]; + ds.StencilTestFunction = genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare]; + ds.BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail]; + ds.BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass]; + ds.BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail]; + ds.BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare]; + } + + const bool pma = want_stencil_pma_fix(cmd_buffer, &opt_ds); + genX(cmd_buffer_enable_pma_fix)(cmd_buffer, pma); + } +#endif + +#if GFX_VER >= 12 + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) { + db.DepthBoundsTestEnable = dyn->ds.depth.bounds_test.enable; + db.DepthBoundsTestMinValue = dyn->ds.depth.bounds_test.min; + db.DepthBoundsTestMaxValue = dyn->ds.depth.bounds_test.max; + } + } +#endif + + if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) { + ls.LineStipplePattern = dyn->rs.line.stipple.pattern; + ls.LineStippleInverseRepeatCount = + 1.0f / MAX2(1, dyn->rs.line.stipple.factor); + ls.LineStippleRepeatCount = dyn->rs.line.stipple.factor; + } + } + + if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE | + ANV_CMD_DIRTY_INDEX_BUFFER)) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) { +#if GFX_VERx10 >= 125 + vf.GeometryDistributionEnable = true; +#endif + vf.IndexedDrawCutIndexEnable = dyn->ia.primitive_restart_enable; + vf.CutIndex = cmd_buffer->state.gfx.restart_index; + } + } + + if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_INDEX_BUFFER) { + struct anv_buffer *buffer = cmd_buffer->state.gfx.index_buffer; + uint32_t offset = cmd_buffer->state.gfx.index_offset; + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) { + ib.IndexFormat = cmd_buffer->state.gfx.index_type; + ib.MOCS = anv_mocs(cmd_buffer->device, + buffer->address.bo, + ISL_SURF_USAGE_INDEX_BUFFER_BIT); +#if GFX_VER >= 12 + ib.L3BypassDisable = true; +#endif + ib.BufferStartingAddress = anv_address_add(buffer->address, offset); + ib.BufferSize = vk_buffer_range(&buffer->vk, offset, + VK_WHOLE_SIZE); + } + } + +#if GFX_VERx10 >= 125 + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) { + 
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VFG), vfg) { + /* If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE*/ + vfg.DistributionMode = + anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? RR_STRICT : + RR_FREE; + vfg.DistributionGranularity = BatchLevelGranularity; + /* Wa_14014890652 */ + if (intel_device_info_is_dg2(cmd_buffer->device->info)) + vfg.GranularityThresholdDisable = 1; + vfg.ListCutIndexEnable = dyn->ia.primitive_restart_enable; + /* 192 vertices for TRILIST_ADJ */ + vfg.ListNBatchSizeScale = 0; + /* Batch size of 384 vertices */ + vfg.List3BatchSizeScale = 2; + /* Batch size of 128 vertices */ + vfg.List2BatchSizeScale = 1; + /* Batch size of 128 vertices */ + vfg.List1BatchSizeScale = 2; + /* Batch size of 256 vertices for STRIP topologies */ + vfg.StripBatchSizeScale = 3; + /* 192 control points for PATCHLIST_3 */ + vfg.PatchBatchSizeScale = 1; + /* 192 control points for PATCHLIST_3 */ + vfg.PatchBatchSizeMultiplier = 31; + } + } +#endif + + if (pipeline->base.device->vk.enabled_extensions.EXT_sample_locations && + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS)) + genX(emit_sample_pattern)(&cmd_buffer->batch, dyn->ms.sample_locations); + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { + /* 3DSTATE_WM in the hope we can avoid spawning fragment shaders + * threads. + */ + uint32_t wm_dwords[GENX(3DSTATE_WM_length)]; + struct GENX(3DSTATE_WM) wm = { + GENX(3DSTATE_WM_header), + + .ForceThreadDispatchEnable = anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) && + (pipeline->force_fragment_thread_dispatch || + anv_cmd_buffer_all_color_write_masked(cmd_buffer)) ? + ForceON : 0, + }; + GENX(3DSTATE_WM_pack)(NULL, wm_dwords, &wm); + + anv_batch_emit_merge(&cmd_buffer->batch, wm_dwords, pipeline->gfx8.wm); + } + + if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) || + BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) { + const uint8_t color_writes = dyn->cb.color_write_enables; + const struct anv_cmd_graphics_state *state = &cmd_buffer->state.gfx; + bool has_writeable_rt = + anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) && + (color_writes & ((1u << state->color_att_count) - 1)) != 0; + + /* 3DSTATE_PS_BLEND to be consistent with the rest of the + * BLEND_STATE_ENTRY. + */ + uint32_t ps_blend_dwords[GENX(3DSTATE_PS_BLEND_length)]; + struct GENX(3DSTATE_PS_BLEND) ps_blend = { + GENX(3DSTATE_PS_BLEND_header), + .HasWriteableRT = has_writeable_rt, + }; + GENX(3DSTATE_PS_BLEND_pack)(NULL, ps_blend_dwords, &ps_blend); + anv_batch_emit_merge(&cmd_buffer->batch, ps_blend_dwords, + pipeline->gfx8.ps_blend); + + uint32_t blend_dws[GENX(BLEND_STATE_length) + + MAX_RTS * GENX(BLEND_STATE_ENTRY_length)]; + uint32_t *dws = blend_dws; + memset(blend_dws, 0, sizeof(blend_dws)); + + /* Skip this part */ + dws += GENX(BLEND_STATE_length); + + for (uint32_t i = 0; i < MAX_RTS; i++) { + /* Disable anything above the current number of color attachments. 
*/ + bool write_disabled = i >= cmd_buffer->state.gfx.color_att_count || + (color_writes & BITFIELD_BIT(i)) == 0; + struct GENX(BLEND_STATE_ENTRY) entry = { + .WriteDisableAlpha = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_A_BIT) == 0, + .WriteDisableRed = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_R_BIT) == 0, + .WriteDisableGreen = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_G_BIT) == 0, + .WriteDisableBlue = write_disabled || + (pipeline->color_comp_writes[i] & + VK_COLOR_COMPONENT_B_BIT) == 0, + .LogicOpFunction = genX(vk_to_intel_logic_op)[dyn->cb.logic_op], + }; + GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry); + dws += GENX(BLEND_STATE_ENTRY_length); + } + + uint32_t num_dwords = GENX(BLEND_STATE_length) + + GENX(BLEND_STATE_ENTRY_length) * MAX_RTS; + + struct anv_state blend_states = + anv_cmd_buffer_merge_dynamic(cmd_buffer, blend_dws, + pipeline->gfx8.blend_state, num_dwords, 64); + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) { + bsp.BlendStatePointer = blend_states.offset; + bsp.BlendStatePointerValid = true; + } + } + + /* When we're done, there is no more dirty gfx state. */ + vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state); + cmd_buffer->state.gfx.dirty = 0; +} diff --git a/src/intel/vulkan_hasvk/meson.build b/src/intel/vulkan_hasvk/meson.build new file mode 100644 index 00000000000..69e4341e1d9 --- /dev/null +++ b/src/intel/vulkan_hasvk/meson.build @@ -0,0 +1,265 @@ +# Copyright © 2017-2019 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
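The per-RT loop that closes gfx8_cmd_buffer.c above combines the dynamic color-write-enable bitmask with the pipeline's per-attachment component write mask. Below is a minimal standalone sketch of just that combination; the struct, constants and function names are hypothetical stand-ins for illustration, not driver code (the real loop packs the resulting booleans into GENX(BLEND_STATE_ENTRY) as shown above).

   #include <stdbool.h>
   #include <stdint.h>
   #include <stdio.h>

   /* Hypothetical stand-ins for the Vulkan component bits and driver state. */
   #define COMP_R (1u << 0)
   #define COMP_G (1u << 1)
   #define COMP_B (1u << 2)
   #define COMP_A (1u << 3)

   struct rt_write_disable {
      bool r, g, b, a;
   };

   /* Mirrors the loop above: a render target is fully write-disabled when it
    * is beyond the current attachment count or its dynamic color-write-enable
    * bit is clear; otherwise each channel follows the pipeline's component
    * write mask for that attachment.
    */
   static struct rt_write_disable
   compute_write_disable(uint32_t rt, uint32_t color_att_count,
                         uint8_t color_write_enables,
                         const uint8_t *color_comp_writes)
   {
      bool disabled = rt >= color_att_count ||
                      (color_write_enables & (1u << rt)) == 0;
      return (struct rt_write_disable) {
         .r = disabled || (color_comp_writes[rt] & COMP_R) == 0,
         .g = disabled || (color_comp_writes[rt] & COMP_G) == 0,
         .b = disabled || (color_comp_writes[rt] & COMP_B) == 0,
         .a = disabled || (color_comp_writes[rt] & COMP_A) == 0,
      };
   }

   int main(void)
   {
      const uint8_t comp_writes[2] = { COMP_R | COMP_G | COMP_B | COMP_A,
                                       COMP_R | COMP_G };
      for (uint32_t rt = 0; rt < 2; rt++) {
         struct rt_write_disable wd =
            compute_write_disable(rt, 2, 0x3, comp_writes);
         printf("RT%u write-disable rgba: %d%d%d%d\n",
                rt, wd.r, wd.g, wd.b, wd.a);
      }
      return 0;
   }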
+ +anv_hasvk_entrypoints = custom_target( + 'anv_hasvk_entrypoints', + input : [vk_entrypoints_gen, vk_api_xml], + output : ['anv_entrypoints.h', 'anv_entrypoints.c'], + command : [ + prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak', + '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'anv', + '--device-prefix', 'gfx7', '--device-prefix', 'gfx75', + '--device-prefix', 'gfx8', '--device-prefix', 'gfx9', + '--device-prefix', 'gfx11', '--device-prefix', 'gfx12', + '--device-prefix', 'gfx125', + ], + depend_files : vk_entrypoints_gen_depend_files, +) + +intel_hasvk_icd = custom_target( + 'intel_hasvk_icd', + input : [vk_icd_gen, vk_api_xml], + output : 'intel_hasvk_icd.@0@.json'.format(host_machine.cpu()), + command : [ + prog_python, '@INPUT0@', + '--api-version', '1.3', '--xml', '@INPUT1@', + '--lib-path', join_paths(get_option('prefix'), get_option('libdir'), + 'libvulkan_intel_hasvk.so'), + '--out', '@OUTPUT@', + ], + build_by_default : true, + install_dir : with_vulkan_icd_dir, + install : true, +) + +if meson.version().version_compare('>= 0.58') + _dev_icdname = 'intel_hasvk_devenv_icd.@0@.json'.format(host_machine.cpu()) + custom_target( + 'intel_hasvk_devenv_icd', + input : [vk_icd_gen, vk_api_xml], + output : _dev_icdname, + command : [ + prog_python, '@INPUT0@', + '--api-version', '1.3', '--xml', '@INPUT1@', + '--lib-path', meson.current_build_dir() / 'libvulkan_intel_hasvk.so', + '--out', '@OUTPUT@', + ], + build_by_default : true, + ) + + devenv.append('VK_ICD_FILENAMES', meson.current_build_dir() / _dev_icdname) +endif + +libanv_per_hw_ver_libs = [] +anv_per_hw_ver_files = files( + 'genX_blorp_exec.c', + 'genX_cmd_buffer.c', + 'genX_gpu_memcpy.c', + 'genX_pipeline.c', + 'genX_query.c', + 'genX_state.c', +) +foreach g : [['70', ['gfx7_cmd_buffer.c']], ['75', ['gfx7_cmd_buffer.c']], + ['80', ['gfx8_cmd_buffer.c']], ['90', ['gfx8_cmd_buffer.c']], + ['110', ['gfx8_cmd_buffer.c']], ['120', ['gfx8_cmd_buffer.c']], + ['125', ['gfx8_cmd_buffer.c']]] + _gfx_ver = g[0] + libanv_per_hw_ver_libs += static_library( + 'anv_per_hw_ver@0@'.format(_gfx_ver), + [anv_per_hw_ver_files, g[1], anv_hasvk_entrypoints[0]], + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_compiler, inc_intel, + ], + c_args : [ + no_override_init_args, c_sse2_args, + '-DGFX_VERx10=@0@'.format(_gfx_ver), + ], + gnu_symbol_visibility : 'hidden', + dependencies : [ + dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml, + idep_vulkan_util_headers, idep_vulkan_wsi_headers, + idep_vulkan_runtime_headers, idep_intel_driver_ds_headers, + ], + ) +endforeach + +libanv_files = files( + 'anv_acceleration_structure.c', + 'anv_allocator.c', + 'anv_android.h', + 'anv_batch_chain.c', + 'anv_blorp.c', + 'anv_bo_sync.c', + 'anv_cmd_buffer.c', + 'anv_descriptor_set.c', + 'anv_device.c', + 'anv_formats.c', + 'anv_genX.h', + 'anv_image.c', + 'anv_measure.c', + 'anv_measure.h', + 'anv_nir.h', + 'anv_nir_add_base_work_group_id.c', + 'anv_nir_apply_pipeline_layout.c', + 'anv_nir_compute_push_layout.c', + 'anv_nir_lower_multiview.c', + 'anv_nir_lower_ubo_loads.c', + 'anv_nir_lower_ycbcr_textures.c', + 'anv_perf.c', + 'anv_pipeline.c', + 'anv_pipeline_cache.c', + 'anv_private.h', + 'anv_queue.c', + 'anv_util.c', + 'anv_utrace.c', + 'anv_wsi.c', +) + +anv_deps = [ + dep_libdrm, + dep_valgrind, + idep_genxml, + idep_nir_headers, + idep_vulkan_util_headers, + idep_vulkan_runtime_headers, + idep_vulkan_wsi_headers, +] +anv_flags = [ + no_override_init_args, + c_sse2_args, +] + 
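The foreach above compiles the shared genX_*.c sources once per hardware generation, each time with a different -DGFX_VERx10 value, and the resulting per-generation static libraries are linked into the driver alongside the gfx-prefixed entrypoints generated earlier. A small illustrative sketch of how such compile-time gating plays out in a single translation unit follows; this is not driver code, and GFX_VER is derived locally for the example (the gfx7/gfx8 command buffer code earlier in the patch gates on the same macros to pick the primitive-restart packet).

   #include <stdio.h>

   /* Normally supplied by meson: -DGFX_VERx10=70, 75, 80, 90, ... */
   #ifndef GFX_VERx10
   #define GFX_VERx10 80
   #endif
   #define GFX_VER (GFX_VERx10 / 10)

   /* The same source picks a different code path per generation at compile
    * time, so each per-gen library only contains the branch relevant to it.
    */
   static const char *cut_index_strategy(void)
   {
   #if GFX_VERx10 == 75
      return "Haswell-specific 3DSTATE_VF packet";
   #elif GFX_VER >= 8
      return "3DSTATE_VF packet (gfx8+)";
   #else
      return "3DSTATE_INDEX_BUFFER::CutIndexEnable";
   #endif
   }

   int main(void)
   {
      printf("GFX_VERx10=%d: primitive restart via %s\n",
             GFX_VERx10, cut_index_strategy());
      return 0;
   }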
+anv_cpp_flags = [] + +if with_platform_x11 + anv_deps += dep_xcb_dri3 +endif + +if with_platform_wayland + anv_deps += dep_wayland_client +endif + +if with_xlib_lease + anv_deps += [dep_xlib_xrandr] +endif + +if with_platform_android + libanv_files += files('anv_android.c') +else + libanv_files += files('anv_android_stubs.c') +endif + +anv_deps += idep_intel_driver_ds_headers + +libanv_hasvk_common = static_library( + 'anv_hasvk_common', + [ + libanv_files, anv_hasvk_entrypoints, sha1_h, + gen_xml_pack, + ], + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, + inc_util, + ], + c_args : anv_flags, + cpp_args : anv_cpp_flags, + gnu_symbol_visibility : 'hidden', + dependencies : anv_deps, +) + +libvulkan_intel_hasvk = shared_library( + 'vulkan_intel_hasvk', + [files('anv_gem.c'), anv_hasvk_entrypoints[0]], + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, + ], + link_whole : [libanv_hasvk_common, libanv_per_hw_ver_libs], + link_with : [ + libintel_compiler, libintel_dev, libisl, libblorp, libintel_perf, + ], + dependencies : [ + dep_thread, dep_dl, dep_m, anv_deps, idep_libintel_common, + idep_nir, idep_genxml, idep_vulkan_util, idep_vulkan_wsi, + idep_vulkan_runtime, idep_mesautil, idep_xmlconfig, + idep_intel_driver_ds, + ], + c_args : anv_flags, + gnu_symbol_visibility : 'hidden', + link_args : [ld_args_build_id, ld_args_bsymbolic, ld_args_gc_sections], + install : true, +) + +if with_symbols_check + test( + 'anv symbols check', + symbols_check, + args : [ + '--lib', libvulkan_intel_hasvk, + '--symbols-file', vulkan_icd_symbols, + symbols_check_args, + ], + suite : ['intel'], + ) +endif + +if with_tests + libvulkan_intel_hasvk_test = static_library( + 'vulkan_intel_hasvk_test', + [files('anv_gem_stubs.c'), anv_hasvk_entrypoints[0]], + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, + ], + link_whole : libanv_hasvk_common, + link_with : [ + libanv_per_hw_ver_libs, libintel_compiler, libintel_common, libintel_dev, + libisl, libblorp, libintel_perf, + ], + dependencies : [ + dep_thread, dep_dl, dep_m, anv_deps, + idep_nir, idep_vulkan_util, idep_vulkan_wsi, idep_vulkan_runtime, + idep_mesautil, + ], + c_args : anv_flags, + gnu_symbol_visibility : 'hidden', + ) + + foreach t : ['block_pool_no_free', 'block_pool_grow_first', + 'state_pool_no_free', 'state_pool_free_list_only', + 'state_pool', 'state_pool_padding'] + test( + 'anv_hasvk_@0@'.format(t), + executable( + t, + ['tests/@0@.c'.format(t), anv_hasvk_entrypoints[0]], + c_args : [ c_sse2_args ], + link_with : libvulkan_intel_hasvk_test, + dependencies : [ + dep_libdrm, dep_thread, dep_m, dep_valgrind, + idep_vulkan_util, idep_vulkan_wsi_headers, + idep_vulkan_runtime, idep_intel_driver_ds, + ], + include_directories : [ + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, + ], + ), + suite : ['intel'], + ) + endforeach +endif diff --git a/src/intel/vulkan_hasvk/tests/block_pool_grow_first.c b/src/intel/vulkan_hasvk/tests/block_pool_grow_first.c new file mode 100644 index 00000000000..109275b07cc --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/block_pool_grow_first.c @@ -0,0 +1,67 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including 
without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" +#include "test_common.h" + +int main(void) +{ + struct anv_physical_device physical_device = { + .use_softpin = true, + }; + struct anv_device device = {}; + struct anv_block_pool pool; + + /* Create a pool with initial size smaller than the block allocated, so + * that it must grow in the first allocation. + */ + const uint32_t block_size = 16 * 1024; + const uint32_t initial_size = block_size / 2; + + anv_device_set_physical(&device, &physical_device); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache, &device); + anv_block_pool_init(&pool, &device, "test", 4096, initial_size); + ASSERT(pool.size == initial_size); + + uint32_t padding; + int32_t offset = anv_block_pool_alloc(&pool, block_size, &padding); + + /* Pool will have grown at least space to fit the new allocation. */ + ASSERT(pool.size > initial_size); + ASSERT(pool.size >= initial_size + block_size); + + /* The whole initial size is considered padding and the allocation should be + * right next to it. + */ + ASSERT(padding == initial_size); + ASSERT(offset == initial_size); + + /* Use the memory to ensure it is valid. */ + void *map = anv_block_pool_map(&pool, offset, block_size); + memset(map, 22, block_size); + + anv_block_pool_finish(&pool); + anv_bo_cache_finish(&device.bo_cache); + pthread_mutex_destroy(&device.mutex); +} diff --git a/src/intel/vulkan_hasvk/tests/block_pool_no_free.c b/src/intel/vulkan_hasvk/tests/block_pool_no_free.c new file mode 100644 index 00000000000..e0e24dcc0c4 --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/block_pool_no_free.c @@ -0,0 +1,153 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <pthread.h> + +#include "anv_private.h" +#include "test_common.h" + +#define NUM_THREADS 16 +#define BLOCKS_PER_THREAD 1024 +#define NUM_RUNS 64 + +struct job { + pthread_t thread; + unsigned id; + struct anv_block_pool *pool; + int32_t blocks[BLOCKS_PER_THREAD]; + int32_t back_blocks[BLOCKS_PER_THREAD]; +} jobs[NUM_THREADS]; + + +static void *alloc_blocks(void *_job) +{ + struct job *job = _job; + uint32_t job_id = job - jobs; + uint32_t block_size = 16 * ((job_id % 4) + 1); + int32_t block, *data; + + for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) { + block = anv_block_pool_alloc(job->pool, block_size, NULL); + data = anv_block_pool_map(job->pool, block, block_size); + *data = block; + ASSERT(block >= 0); + job->blocks[i] = block; + + block = anv_block_pool_alloc_back(job->pool, block_size); + data = anv_block_pool_map(job->pool, block, block_size); + *data = block; + ASSERT(block < 0); + job->back_blocks[i] = -block; + } + + for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) { + block = job->blocks[i]; + data = anv_block_pool_map(job->pool, block, block_size); + ASSERT(*data == block); + + block = -job->back_blocks[i]; + data = anv_block_pool_map(job->pool, block, block_size); + ASSERT(*data == block); + } + + return NULL; +} + +static void validate_monotonic(int32_t **blocks) +{ + /* A list of indices, one per thread */ + unsigned next[NUM_THREADS]; + memset(next, 0, sizeof(next)); + + int highest = -1; + while (true) { + /* First, we find which thread has the lowest next element */ + int32_t thread_min = INT32_MAX; + int min_thread_idx = -1; + for (unsigned i = 0; i < NUM_THREADS; i++) { + if (next[i] >= BLOCKS_PER_THREAD) + continue; + + if (thread_min > blocks[i][next[i]]) { + thread_min = blocks[i][next[i]]; + min_thread_idx = i; + } + } + + /* The only way this can happen is if all of the next[] values are at + * BLOCKS_PER_THREAD, in which case, we're done.
+ */ + if (thread_min == INT32_MAX) + break; + + /* That next element had better be higher than the previous highest */ + ASSERT(blocks[min_thread_idx][next[min_thread_idx]] > highest); + + highest = blocks[min_thread_idx][next[min_thread_idx]]; + next[min_thread_idx]++; + } +} + +static void run_test() +{ + struct anv_physical_device physical_device = { + .use_relocations = true, + }; + struct anv_device device = {}; + struct anv_block_pool pool; + + anv_device_set_physical(&device, &physical_device); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache, &device); + anv_block_pool_init(&pool, &device, "test", 4096, 4096); + + for (unsigned i = 0; i < NUM_THREADS; i++) { + jobs[i].pool = &pool; + jobs[i].id = i; + pthread_create(&jobs[i].thread, NULL, alloc_blocks, &jobs[i]); + } + + for (unsigned i = 0; i < NUM_THREADS; i++) + pthread_join(jobs[i].thread, NULL); + + /* Validate that the block allocations were monotonic */ + int32_t *block_ptrs[NUM_THREADS]; + for (unsigned i = 0; i < NUM_THREADS; i++) + block_ptrs[i] = jobs[i].blocks; + validate_monotonic(block_ptrs); + + /* Validate that the back block allocations were monotonic */ + for (unsigned i = 0; i < NUM_THREADS; i++) + block_ptrs[i] = jobs[i].back_blocks; + validate_monotonic(block_ptrs); + + anv_block_pool_finish(&pool); + anv_bo_cache_finish(&device.bo_cache); + pthread_mutex_destroy(&device.mutex); +} + +int main(void) +{ + for (unsigned i = 0; i < NUM_RUNS; i++) + run_test(); +} diff --git a/src/intel/vulkan_hasvk/tests/state_pool.c b/src/intel/vulkan_hasvk/tests/state_pool.c new file mode 100644 index 00000000000..57cfa73d54e --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/state_pool.c @@ -0,0 +1,59 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <pthread.h> + +#include "anv_private.h" + +#include "test_common.h" + +#define NUM_THREADS 8 +#define STATES_PER_THREAD_LOG2 10 +#define STATES_PER_THREAD (1 << STATES_PER_THREAD_LOG2) +#define NUM_RUNS 64 + +#include "state_pool_test_helper.h" + +int main(void) +{ + struct anv_physical_device physical_device = { }; + struct anv_device device = {}; + struct anv_state_pool state_pool; + + anv_device_set_physical(&device, &physical_device); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache, &device); + + for (unsigned i = 0; i < NUM_RUNS; i++) { + anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 256); + + /* Grab one so a zero offset is impossible */ + anv_state_pool_alloc(&state_pool, 16, 16); + + run_state_pool_test(&state_pool); + + anv_state_pool_finish(&state_pool); + } + + anv_bo_cache_finish(&device.bo_cache); + pthread_mutex_destroy(&device.mutex); +} diff --git a/src/intel/vulkan_hasvk/tests/state_pool_free_list_only.c b/src/intel/vulkan_hasvk/tests/state_pool_free_list_only.c new file mode 100644 index 00000000000..602346fedae --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/state_pool_free_list_only.c @@ -0,0 +1,68 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <pthread.h> + +#include "anv_private.h" +#include "test_common.h" + +#define NUM_THREADS 8 +#define STATES_PER_THREAD_LOG2 12 +#define STATES_PER_THREAD (1 << STATES_PER_THREAD_LOG2) + +#include "state_pool_test_helper.h" + +int main(void) +{ + struct anv_physical_device physical_device = { }; + struct anv_device device = {}; + struct anv_state_pool state_pool; + + anv_device_set_physical(&device, &physical_device); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache, &device); + anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 4096); + + /* Grab one so a zero offset is impossible */ + anv_state_pool_alloc(&state_pool, 16, 16); + + /* Grab and return enough states that the state pool test below won't + * actually ever resize anything.
+ */ + { + struct anv_state states[NUM_THREADS * STATES_PER_THREAD]; + for (unsigned i = 0; i < NUM_THREADS * STATES_PER_THREAD; i++) { + states[i] = anv_state_pool_alloc(&state_pool, 16, 16); + ASSERT(states[i].offset != 0); + } + + for (unsigned i = 0; i < NUM_THREADS * STATES_PER_THREAD; i++) + anv_state_pool_free(&state_pool, states[i]); + } + + run_state_pool_test(&state_pool); + + anv_state_pool_finish(&state_pool); + anv_bo_cache_finish(&device.bo_cache); + pthread_mutex_destroy(&device.mutex); +} diff --git a/src/intel/vulkan_hasvk/tests/state_pool_no_free.c b/src/intel/vulkan_hasvk/tests/state_pool_no_free.c new file mode 100644 index 00000000000..fe076830406 --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/state_pool_no_free.c @@ -0,0 +1,119 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <pthread.h> + +#include "anv_private.h" +#include "test_common.h" + +#define NUM_THREADS 16 +#define STATES_PER_THREAD 1024 +#define NUM_RUNS 64 + +struct job { + pthread_t thread; + unsigned id; + struct anv_state_pool *pool; + uint32_t offsets[STATES_PER_THREAD]; +} jobs[NUM_THREADS]; + +pthread_barrier_t barrier; + +static void *alloc_states(void *_job) +{ + struct job *job = _job; + + pthread_barrier_wait(&barrier); + + for (unsigned i = 0; i < STATES_PER_THREAD; i++) { + struct anv_state state = anv_state_pool_alloc(job->pool, 16, 16); + job->offsets[i] = state.offset; + } + + return NULL; +} + +static void run_test() +{ + struct anv_physical_device physical_device = { }; + struct anv_device device = {}; + struct anv_state_pool state_pool; + + anv_device_set_physical(&device, &physical_device); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache, &device); + anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 64); + + pthread_barrier_init(&barrier, NULL, NUM_THREADS); + + for (unsigned i = 0; i < NUM_THREADS; i++) { + jobs[i].pool = &state_pool; + jobs[i].id = i; + pthread_create(&jobs[i].thread, NULL, alloc_states, &jobs[i]); + } + + for (unsigned i = 0; i < NUM_THREADS; i++) + pthread_join(jobs[i].thread, NULL); + + /* A list of indices, one per thread */ + unsigned next[NUM_THREADS]; + memset(next, 0, sizeof(next)); + + int highest = -1; + while (true) { + /* First, we find which thread has the highest next element */ + int thread_max = -1; + int max_thread_idx = -1; + for (unsigned i = 0; i < NUM_THREADS; i++) { + if (next[i] >= STATES_PER_THREAD) + continue; + + if (thread_max < jobs[i].offsets[next[i]]) { + thread_max = jobs[i].offsets[next[i]]; + max_thread_idx = i; + } + } + + /* The only way this can happen is if all of the next[] values are at + * STATES_PER_THREAD, in which case, we're done. + */ + if (thread_max == -1) + break; + + /* That next element had better be higher than the previous highest */ + ASSERT(jobs[max_thread_idx].offsets[next[max_thread_idx]] > highest); + + highest = jobs[max_thread_idx].offsets[next[max_thread_idx]]; + next[max_thread_idx]++; + } + + anv_state_pool_finish(&state_pool); + anv_bo_cache_finish(&device.bo_cache); + pthread_mutex_destroy(&device.mutex); +} + +int main(void) +{ + for (unsigned i = 0; i < NUM_RUNS; i++) + run_test(); +} diff --git a/src/intel/vulkan_hasvk/tests/state_pool_padding.c b/src/intel/vulkan_hasvk/tests/state_pool_padding.c new file mode 100644 index 00000000000..0ed72e1e502 --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/state_pool_padding.c @@ -0,0 +1,79 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "anv_private.h" +#include "test_common.h" + +int main(void) +{ + struct anv_physical_device physical_device = { + .use_softpin = true, + }; + struct anv_device device = {}; + struct anv_state_pool state_pool; + + anv_device_set_physical(&device, &physical_device); + pthread_mutex_init(&device.mutex, NULL); + anv_bo_cache_init(&device.bo_cache, &device); + anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 4096); + + /* Get the size of the underlying block_pool */ + struct anv_block_pool *bp = &state_pool.block_pool; + uint64_t pool_size = bp->size; + + /* Grab one so the pool has some initial usage */ + anv_state_pool_alloc(&state_pool, 16, 16); + + /* Grab a state that is the size of the initial allocation */ + struct anv_state state = anv_state_pool_alloc(&state_pool, pool_size, 16); + + /* The pool must have grown */ + ASSERT(bp->size > pool_size); + + /* And the state must have been allocated at the end of the original size */ + ASSERT(state.offset == pool_size); + + /* A new allocation that fits into the returned empty space should have an + * offset within the original pool size + */ + state = anv_state_pool_alloc(&state_pool, 4096, 16); + ASSERT(state.offset + state.alloc_size <= pool_size); + + /* We should be able to allocate pool->block_size'd chunks in the returned area + */ + int left_chunks = pool_size / 4096 - 2; + for (int i = 0; i < left_chunks; i++) { + state = anv_state_pool_alloc(&state_pool, 4096, 16); + ASSERT(state.offset + state.alloc_size <= pool_size); + } + + /* Now the next chunk to be allocated should make the pool grow again */ + pool_size = bp->size; + state = anv_state_pool_alloc(&state_pool, 4096, 16); + ASSERT(bp->size > pool_size); + ASSERT(state.offset == pool_size); + + anv_state_pool_finish(&state_pool); + anv_bo_cache_finish(&device.bo_cache); + pthread_mutex_destroy(&device.mutex); +} diff --git a/src/intel/vulkan_hasvk/tests/state_pool_test_helper.h b/src/intel/vulkan_hasvk/tests/state_pool_test_helper.h new file mode 100644 index 00000000000..f22a28ecc6f --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/state_pool_test_helper.h @@ -0,0 +1,71 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <pthread.h> + +struct job { + struct anv_state_pool *pool; + unsigned id; + pthread_t thread; +} jobs[NUM_THREADS]; + +pthread_barrier_t barrier; + +static void *alloc_states(void *void_job) +{ + struct job *job = void_job; + + const unsigned chunk_size = 1 << (job->id % STATES_PER_THREAD_LOG2); + const unsigned num_chunks = STATES_PER_THREAD / chunk_size; + + struct anv_state states[chunk_size]; + + pthread_barrier_wait(&barrier); + + for (unsigned c = 0; c < num_chunks; c++) { + for (unsigned i = 0; i < chunk_size; i++) { + states[i] = anv_state_pool_alloc(job->pool, 16, 16); + memset(states[i].map, 139, 16); + ASSERT(states[i].offset != 0); + } + + for (unsigned i = 0; i < chunk_size; i++) + anv_state_pool_free(job->pool, states[i]); + } + + return NULL; +} + +static void run_state_pool_test(struct anv_state_pool *state_pool) +{ + pthread_barrier_init(&barrier, NULL, NUM_THREADS); + + for (unsigned i = 0; i < NUM_THREADS; i++) { + jobs[i].pool = state_pool; + jobs[i].id = i; + pthread_create(&jobs[i].thread, NULL, alloc_states, &jobs[i]); + } + + for (unsigned i = 0; i < NUM_THREADS; i++) + pthread_join(jobs[i].thread, NULL); +} diff --git a/src/intel/vulkan_hasvk/tests/test_common.h b/src/intel/vulkan_hasvk/tests/test_common.h new file mode 100644 index 00000000000..3f883e3bdcd --- /dev/null +++ b/src/intel/vulkan_hasvk/tests/test_common.h @@ -0,0 +1,34 @@ +/* + * Copyright © 2020 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> + +#define ASSERT(cond) \ + do { \ + if (!(cond)) { \ + fprintf(stderr, "%s:%d: Test assertion `%s` failed.\n", \ + __FILE__, __LINE__, # cond); \ + abort(); \ + } \ + } while (false)
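
Two properties of the tests above are worth calling out: test_common.h's ASSERT aborts with file and line on failure, and the *_no_free tests verify that offsets handed out concurrently, when merged across threads in increasing order, form a strictly monotonic sequence. The standalone sketch below mirrors that merge-and-assert logic from validate_monotonic() over hard-coded arrays so it can be compiled with nothing but a C compiler; the thread count, the sample offsets, and the simplified validate_monotonic() signature are invented for illustration and are not part of the patch.

/* Standalone sketch (not part of the patch): replays the per-thread cursor
 * merge used by validate_monotonic() in block_pool_no_free.c on made-up data. */
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ASSERT(cond) \
   do { \
      if (!(cond)) { \
         fprintf(stderr, "%s:%d: Test assertion `%s` failed.\n", \
                 __FILE__, __LINE__, # cond); \
         abort(); \
      } \
   } while (false)

/* Example sizes only; the real test uses 16 threads and 1024 blocks each. */
#define NUM_THREADS 3
#define BLOCKS_PER_THREAD 4

static void validate_monotonic(const int32_t blocks[][BLOCKS_PER_THREAD])
{
   /* One cursor per thread into its (already increasing) offset list. */
   unsigned next[NUM_THREADS] = { 0 };
   int32_t highest = -1;

   while (true) {
      /* Pick the thread whose next unconsumed offset is smallest. */
      int32_t thread_min = INT32_MAX;
      int min_thread_idx = -1;
      for (unsigned i = 0; i < NUM_THREADS; i++) {
         if (next[i] >= BLOCKS_PER_THREAD)
            continue;
         if (blocks[i][next[i]] < thread_min) {
            thread_min = blocks[i][next[i]];
            min_thread_idx = i;
         }
      }

      /* Every cursor is exhausted: all offsets have been consumed. */
      if (min_thread_idx == -1)
         break;

      /* The merged sequence must be strictly increasing. */
      ASSERT(thread_min > highest);
      highest = thread_min;
      next[min_thread_idx]++;
   }
}

int main(void)
{
   /* Offsets as a pool might hand them out: increasing within each thread,
    * unique across threads. */
   const int32_t blocks[NUM_THREADS][BLOCKS_PER_THREAD] = {
      {  0, 48,  96, 160 },
      { 16, 64, 112, 176 },
      { 32, 80, 128, 192 },
   };

   validate_monotonic(blocks);
   printf("monotonic ordering holds\n");
   return 0;
}

Keeping the cursor-per-thread merge shaped like the in-tree version makes it straightforward to compare the sketch against what the patch actually tests.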