From 5a594109623fff43fcd3b874b160c711e81bf55d Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Fri, 6 Jan 2023 18:59:36 +0100 Subject: [PATCH] turnip: add cached and cached-coherent memory types vkd3d requires a cached memory type. The MSM backend doesn't have a special ioctl for memory flushing/invalidation; we'd have to use the cvac and civac ARM assembly instructions (to be done in a following commit). KGSL has an ioctl for this, which is used in this commit. Note: CTS tests don't seem good at testing flushing and invalidating; the ones I found passed on KGSL with both functions being no-ops. Based on the old patch from Jonathan Marek. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7636 Signed-off-by: Danylo Piliaiev Part-of: --- src/freedreno/vulkan/tu_device.cc | 74 ++++++++++++-------- src/freedreno/vulkan/tu_device.h | 8 +++ src/freedreno/vulkan/tu_image.cc | 11 +-- src/freedreno/vulkan/tu_knl.cc | 3 +- src/freedreno/vulkan/tu_knl.h | 14 +++- src/freedreno/vulkan/tu_knl_drm_msm.cc | 53 ++++++++++++-- src/freedreno/vulkan/tu_knl_kgsl.cc | 97 ++++++++++++++++++++++++++ 7 files changed, 217 insertions(+), 43 deletions(-) diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index d8acc58b2d9..4d503e57073 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -326,6 +326,29 @@ tu_physical_device_init(struct tu_physical_device *device, goto fail_free_name; } + device->memory.type_count = 1; + device->memory.types[0] = + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + + if (device->has_cached_coherent_memory) { + device->memory.types[device->memory.type_count] = + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + device->memory.type_count++; + } + + if (device->has_cached_non_coherent_memory) { + 
device->memory.types[device->memory.type_count] = + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + device->memory.type_count++; + } + if (device->has_set_iova) { mtx_init(&device->vma_mutex, mtx_plain); util_vma_heap_init(&device->vma, device->va_start, @@ -1645,12 +1668,13 @@ tu_GetPhysicalDeviceMemoryProperties2(VkPhysicalDevice pdev, props->memoryHeaps[0].size = physical_device->heap.size; props->memoryHeaps[0].flags = physical_device->heap.flags; - props->memoryTypeCount = 1; - props->memoryTypes[0].propertyFlags = - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; - props->memoryTypes[0].heapIndex = 0; + props->memoryTypeCount = physical_device->memory.type_count; + for (uint32_t i = 0; i < physical_device->memory.type_count; i++) { + props->memoryTypes[i] = (VkMemoryType) { + .propertyFlags = physical_device->memory.types[i], + .heapIndex = 0, + }; + } vk_foreach_struct(ext, props2->pNext) { @@ -2673,9 +2697,11 @@ tu_AllocateMemory(VkDevice _device, if (device->bo_sizes) snprintf(name, ARRAY_SIZE(name), "vkAllocateMemory(%ldkb)", (long)DIV_ROUND_UP(pAllocateInfo->allocationSize, 1024)); + VkMemoryPropertyFlags mem_property = + device->physical_device->memory.types[pAllocateInfo->memoryTypeIndex]; result = tu_bo_init_new_explicit_iova( device, &mem->bo, pAllocateInfo->allocationSize, client_address, - alloc_flags, name); + mem_property, alloc_flags, name); } if (result == VK_SUCCESS) { @@ -2761,30 +2787,14 @@ tu_UnmapMemory(VkDevice _device, VkDeviceMemory _memory) /* TODO: unmap here instead of waiting for FreeMemory */ } -VKAPI_ATTR VkResult VKAPI_CALL -tu_FlushMappedMemoryRanges(VkDevice _device, - uint32_t memoryRangeCount, - const VkMappedMemoryRange *pMemoryRanges) -{ - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_InvalidateMappedMemoryRanges(VkDevice _device, - uint32_t memoryRangeCount, - const 
VkMappedMemoryRange *pMemoryRanges) -{ - return VK_SUCCESS; -} - static void -tu_get_buffer_memory_requirements(uint64_t size, +tu_get_buffer_memory_requirements(struct tu_device *dev, uint64_t size, VkMemoryRequirements2 *pMemoryRequirements) { pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { .size = MAX2(align64(size, 64), size), .alignment = 64, - .memoryTypeBits = 1, + .memoryTypeBits = (1 << dev->physical_device->memory.type_count) - 1, }; vk_foreach_struct(ext, pMemoryRequirements->pNext) { @@ -2804,22 +2814,24 @@ tu_get_buffer_memory_requirements(uint64_t size, VKAPI_ATTR void VKAPI_CALL tu_GetBufferMemoryRequirements2( - VkDevice device, + VkDevice _device, const VkBufferMemoryRequirementsInfo2 *pInfo, VkMemoryRequirements2 *pMemoryRequirements) { + TU_FROM_HANDLE(tu_device, device, _device); TU_FROM_HANDLE(tu_buffer, buffer, pInfo->buffer); - tu_get_buffer_memory_requirements(buffer->vk.size, pMemoryRequirements); + tu_get_buffer_memory_requirements(device, buffer->vk.size, pMemoryRequirements); } VKAPI_ATTR void VKAPI_CALL tu_GetDeviceBufferMemoryRequirements( - VkDevice device, + VkDevice _device, const VkDeviceBufferMemoryRequirements *pInfo, VkMemoryRequirements2 *pMemoryRequirements) { - tu_get_buffer_memory_requirements(pInfo->pCreateInfo->size, pMemoryRequirements); + TU_FROM_HANDLE(tu_device, device, _device); + tu_get_buffer_memory_requirements(device, pInfo->pCreateInfo->size, pMemoryRequirements); } VKAPI_ATTR void VKAPI_CALL @@ -3296,8 +3308,10 @@ tu_GetMemoryFdPropertiesKHR(VkDevice _device, int fd, VkMemoryFdPropertiesKHR *pMemoryFdProperties) { + TU_FROM_HANDLE(tu_device, device, _device); assert(handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT); - pMemoryFdProperties->memoryTypeBits = 1; + pMemoryFdProperties->memoryTypeBits = + (1 << device->physical_device->memory.type_count) - 1; return VK_SUCCESS; } diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 
1cdf1209109..9dbc65a67a3 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -91,6 +91,14 @@ struct tu_physical_device uint64_t va_start; uint64_t va_size; + bool has_cached_coherent_memory; + bool has_cached_non_coherent_memory; + + struct { + uint32_t type_count; + VkMemoryPropertyFlags types[VK_MAX_MEMORY_TYPES]; + } memory; + struct fd_dev_id dev_id; const struct fd_dev_info *info; diff --git a/src/freedreno/vulkan/tu_image.cc b/src/freedreno/vulkan/tu_image.cc index 093f320d6e9..ca3cc60f7c5 100644 --- a/src/freedreno/vulkan/tu_image.cc +++ b/src/freedreno/vulkan/tu_image.cc @@ -752,13 +752,13 @@ tu_DestroyImage(VkDevice _device, } static void -tu_get_image_memory_requirements(struct tu_image *image, +tu_get_image_memory_requirements(struct tu_device *dev, struct tu_image *image, VkMemoryRequirements2 *pMemoryRequirements) { pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) { .size = image->total_size, .alignment = image->layout[0].base_align, - .memoryTypeBits = 1, + .memoryTypeBits = (1 << dev->physical_device->memory.type_count) - 1, }; vk_foreach_struct(ext, pMemoryRequirements->pNext) { @@ -778,13 +778,14 @@ tu_get_image_memory_requirements(struct tu_image *image, } VKAPI_ATTR void VKAPI_CALL -tu_GetImageMemoryRequirements2(VkDevice device, +tu_GetImageMemoryRequirements2(VkDevice _device, const VkImageMemoryRequirementsInfo2 *pInfo, VkMemoryRequirements2 *pMemoryRequirements) { + TU_FROM_HANDLE(tu_device, device, _device); TU_FROM_HANDLE(tu_image, image, pInfo->image); - tu_get_image_memory_requirements(image, pMemoryRequirements); + tu_get_image_memory_requirements(device, image, pMemoryRequirements); } VKAPI_ATTR void VKAPI_CALL @@ -810,7 +811,7 @@ tu_GetDeviceImageMemoryRequirements( tu_image_init(device, &image, pInfo->pCreateInfo, DRM_FORMAT_MOD_INVALID, NULL); - tu_get_image_memory_requirements(&image, pMemoryRequirements); + tu_get_image_memory_requirements(device, &image, pMemoryRequirements); } 
VKAPI_ATTR void VKAPI_CALL diff --git a/src/freedreno/vulkan/tu_knl.cc b/src/freedreno/vulkan/tu_knl.cc index c953431bb14..549acefc9a7 100644 --- a/src/freedreno/vulkan/tu_knl.cc +++ b/src/freedreno/vulkan/tu_knl.cc @@ -27,9 +27,10 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev, struct tu_bo **out_bo, uint64_t size, uint64_t client_iova, + VkMemoryPropertyFlags mem_property, enum tu_bo_alloc_flags flags, const char *name) { - return dev->instance->knl->bo_init(dev, out_bo, size, client_iova, flags, name); + return dev->instance->knl->bo_init(dev, out_bo, size, client_iova, mem_property, flags, name); } VkResult diff --git a/src/freedreno/vulkan/tu_knl.h b/src/freedreno/vulkan/tu_knl.h index ede292c744a..41a2bf80996 100644 --- a/src/freedreno/vulkan/tu_knl.h +++ b/src/freedreno/vulkan/tu_knl.h @@ -60,7 +60,8 @@ struct tu_knl { int (*submitqueue_new)(const struct tu_device *dev, int priority, uint32_t *queue_id); void (*submitqueue_close)(const struct tu_device *dev, uint32_t queue_id); VkResult (*bo_init)(struct tu_device *dev, struct tu_bo **out_bo, uint64_t size, - uint64_t client_iova, enum tu_bo_alloc_flags flags, const char *name); + uint64_t client_iova, VkMemoryPropertyFlags mem_property, + enum tu_bo_alloc_flags flags, const char *name); VkResult (*bo_init_dmabuf)(struct tu_device *dev, struct tu_bo **out_bo, uint64_t size, int prime_fd); int (*bo_export_dmabuf)(struct tu_device *dev, struct tu_bo *bo); @@ -87,13 +88,20 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev, struct tu_bo **out_bo, uint64_t size, uint64_t client_iova, - enum tu_bo_alloc_flags flags, const char *name); + VkMemoryPropertyFlags mem_property, + enum tu_bo_alloc_flags flags, + const char *name); static inline VkResult tu_bo_init_new(struct tu_device *dev, struct tu_bo **out_bo, uint64_t size, enum tu_bo_alloc_flags flags, const char *name) { - return tu_bo_init_new_explicit_iova(dev, out_bo, size, 0, flags, name); + return tu_bo_init_new_explicit_iova( + dev, out_bo, size, 
0, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, + flags, name); } VkResult diff --git a/src/freedreno/vulkan/tu_knl_drm_msm.cc b/src/freedreno/vulkan/tu_knl_drm_msm.cc index c658592d348..f68341e7994 100644 --- a/src/freedreno/vulkan/tu_knl_drm_msm.cc +++ b/src/freedreno/vulkan/tu_knl_drm_msm.cc @@ -133,6 +133,25 @@ tu_drm_get_priorities(const struct tu_physical_device *dev) return val; } +static bool +tu_drm_is_memory_type_supported(int fd, uint32_t flags) +{ + struct drm_msm_gem_new req_alloc = { .size = 0x1000, .flags = flags }; + + int ret = + drmCommandWriteRead(fd, DRM_MSM_GEM_NEW, &req_alloc, sizeof(req_alloc)); + if (ret) { + return false; + } + + struct drm_gem_close req_close = { + .handle = req_alloc.handle, + }; + drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, &req_close); + + return true; +} + static int msm_device_get_gpu_timestamp(struct tu_device *dev, uint64_t *ts) { @@ -387,17 +406,21 @@ msm_bo_init(struct tu_device *dev, struct tu_bo **out_bo, uint64_t size, uint64_t client_iova, + VkMemoryPropertyFlags mem_property, enum tu_bo_alloc_flags flags, const char *name) { - /* TODO: Choose better flags. As of 2018-11-12, freedreno/drm/msm_bo.c - * always sets `flags = MSM_BO_WC`, and we copy that behavior here. 
- */ struct drm_msm_gem_new req = { .size = size, - .flags = MSM_BO_WC + .flags = 0 }; + if (mem_property & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) { + req.flags |= MSM_BO_CACHED_COHERENT; + } else { + req.flags |= MSM_BO_WC; + } + if (flags & TU_BO_ALLOC_GPU_READ_ONLY) req.flags |= MSM_BO_GPU_READONLY; @@ -559,6 +582,22 @@ msm_bo_finish(struct tu_device *dev, struct tu_bo *bo) u_rwlock_rdunlock(&dev->dma_bo_lock); } +VkResult +tu_FlushMappedMemoryRanges(VkDevice _device, + uint32_t memoryRangeCount, + const VkMappedMemoryRange *pMemoryRanges) +{ + return VK_SUCCESS; +} + +VkResult +tu_InvalidateMappedMemoryRanges(VkDevice _device, + uint32_t memoryRangeCount, + const VkMappedMemoryRange *pMemoryRanges) +{ + return VK_SUCCESS; +} + extern const struct vk_sync_type tu_timeline_sync_type; static inline bool @@ -1252,6 +1291,12 @@ tu_knl_drm_msm_load(struct tu_instance *instance, */ device->has_set_iova = false; + /* Even if kernel is new enough, the GPU itself may not support it. */ + device->has_cached_coherent_memory = + (device->msm_minor_version >= 8) && + tu_drm_is_memory_type_supported(fd, MSM_BO_CACHED_COHERENT); + device->has_cached_non_coherent_memory = false; + ret = tu_drm_get_param(device, MSM_PARAM_FAULTS, &device->fault_count); if (ret != 0) { result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, diff --git a/src/freedreno/vulkan/tu_knl_kgsl.cc b/src/freedreno/vulkan/tu_knl_kgsl.cc index 85e9f3d281d..796215f1f07 100644 --- a/src/freedreno/vulkan/tu_knl_kgsl.cc +++ b/src/freedreno/vulkan/tu_knl_kgsl.cc @@ -72,6 +72,7 @@ kgsl_bo_init(struct tu_device *dev, struct tu_bo **out_bo, uint64_t size, uint64_t client_iova, + VkMemoryPropertyFlags mem_property, enum tu_bo_alloc_flags flags, const char *name) { @@ -81,6 +82,16 @@ kgsl_bo_init(struct tu_device *dev, .size = size, }; + if (mem_property & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) { + if (mem_property & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) { + req.flags |= KGSL_MEMFLAGS_IOCOHERENT; + } + + 
req.flags |= KGSL_CACHEMODE_WRITEBACK << KGSL_CACHEMODE_SHIFT; + } else { + req.flags |= KGSL_CACHEMODE_WRITECOMBINE << KGSL_CACHEMODE_SHIFT; + } + if (flags & TU_BO_ALLOC_GPU_READ_ONLY) req.flags |= KGSL_MEMFLAGS_GPUREADONLY; @@ -209,6 +220,66 @@ kgsl_bo_finish(struct tu_device *dev, struct tu_bo *bo) safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_GPUMEM_FREE_ID, &req); } +static VkResult +kgsl_sync_cache(VkDevice _device, + uint32_t op, + uint32_t count, + const VkMappedMemoryRange *ranges) +{ + TU_FROM_HANDLE(tu_device, device, _device); + + struct kgsl_gpuobj_sync_obj *sync_list = + (struct kgsl_gpuobj_sync_obj *) vk_zalloc( + &device->vk.alloc, sizeof(*sync_list), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + struct kgsl_gpuobj_sync gpuobj_sync = { + .objs = (uintptr_t) sync_list, + .obj_len = sizeof(*sync_list), + .count = count, + }; + + for (uint32_t i = 0; i < count; i++) { + TU_FROM_HANDLE(tu_device_memory, mem, ranges[i].memory); + + sync_list[i].op = op; + sync_list[i].id = mem->bo->gem_handle; + sync_list[i].offset = ranges[i].offset; + sync_list[i].length = ranges[i].size == VK_WHOLE_SIZE + ? (mem->bo->size - ranges[i].offset) + : ranges[i].size; + } + + /* There are two other KGSL ioctls for flushing/invalidation: + * - IOCTL_KGSL_GPUMEM_SYNC_CACHE - processes one memory range at a time; + * - IOCTL_KGSL_GPUMEM_SYNC_CACHE_BULK - processes several buffers but + * no way to specify ranges. + * + * IOCTL_KGSL_GPUOBJ_SYNC, in contrast, maps exactly to the VK function. 
+ */ + safe_ioctl(device->fd, IOCTL_KGSL_GPUOBJ_SYNC, &gpuobj_sync); + + vk_free(&device->vk.alloc, sync_list); + + return VK_SUCCESS; +} + +VkResult +tu_FlushMappedMemoryRanges(VkDevice device, + uint32_t count, + const VkMappedMemoryRange *ranges) +{ + return kgsl_sync_cache(device, KGSL_GPUMEM_CACHE_TO_GPU, count, ranges); +} + +VkResult +tu_InvalidateMappedMemoryRanges(VkDevice device, + uint32_t count, + const VkMappedMemoryRange *ranges) +{ + return kgsl_sync_cache(device, KGSL_GPUMEM_CACHE_FROM_GPU, count, ranges); +} + static VkResult get_kgsl_prop(int fd, unsigned int type, void *value, size_t size) { @@ -223,6 +294,26 @@ get_kgsl_prop(int fd, unsigned int type, void *value, size_t size) : VK_SUCCESS; } +static bool +kgsl_is_memory_type_supported(int fd, uint32_t flags) +{ + struct kgsl_gpumem_alloc_id req_alloc = { + .flags = flags, + .size = 0x1000, + }; + + int ret = safe_ioctl(fd, IOCTL_KGSL_GPUMEM_ALLOC_ID, &req_alloc); + if (ret) { + return false; + } + + struct kgsl_gpumem_free_id req_free = { .id = req_alloc.id }; + + safe_ioctl(fd, IOCTL_KGSL_GPUMEM_FREE_ID, &req_free); + + return true; +} + enum kgsl_syncobj_state { KGSL_SYNCOBJ_STATE_UNSIGNALED, KGSL_SYNCOBJ_STATE_SIGNALED, @@ -1169,6 +1260,12 @@ tu_knl_kgsl_load(struct tu_instance *instance, int fd) device->heap.used = 0u; device->heap.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT; + /* Even if kernel is new enough, the GPU itself may not support it. */ + device->has_cached_coherent_memory = kgsl_is_memory_type_supported( + fd, KGSL_MEMFLAGS_IOCOHERENT | + (KGSL_CACHEMODE_WRITEBACK << KGSL_CACHEMODE_SHIFT)); + device->has_cached_non_coherent_memory = true; + instance->knl = &kgsl_knl_funcs; result = tu_physical_device_init(device, instance);