From 8b8ca028a032bc9b8e9311852333861e297e7ba9 Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Thu, 1 May 2025 09:23:49 +0200 Subject: [PATCH] radv: Return VK_ERROR_INCOMPATIBLE_DRIVER for unsupported devices VK_ERROR_INITIALIZATION_FAILED will fail physical device enumeration. Returning VK_ERROR_INCOMPATIBLE_DRIVER means that the driver can still be used on supported GPUs when multiple GPUs are installed. cc: mesa-stable Reviewed-by: Samuel Pitoiset Part-of: (cherry picked from commit 84b9c281fe82dd66f2552687cecb61a8e22809d0) --- .pick_status.json | 2 +- src/amd/common/ac_gpu_info.c | 41 +++++----- src/amd/common/ac_gpu_info.h | 10 ++- src/amd/vulkan/radv_physical_device.c | 12 ++- .../vulkan/winsys/amdgpu/radv_amdgpu_winsys.c | 74 ++++++++++--------- .../winsys/amdgpu/radv_amdgpu_winsys_public.h | 4 +- src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 2 +- 7 files changed, 81 insertions(+), 64 deletions(-) diff --git a/.pick_status.json b/.pick_status.json index 3e4d39bb462..54fae9fd1ec 100644 --- a/.pick_status.json +++ b/.pick_status.json @@ -4,7 +4,7 @@ "description": "radv: Return VK_ERROR_INCOMPATIBLE_DRIVER for unsupported devices", "nominated": true, "nomination_type": 1, - "resolution": 0, + "resolution": 1, "main_sha": null, "because_sha": null, "notes": null diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index 9c47659282e..31ab81d432f 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -510,8 +510,9 @@ static void handle_env_var_force_family(struct radeon_info *info) exit(1); } -bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, - bool require_pci_bus_info) +enum ac_query_gpu_info_result +ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, + bool require_pci_bus_info) { struct amdgpu_gpu_info amdinfo; struct drm_amdgpu_info_device device_info = {0}; @@ -535,7 +536,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, if (!ac_query_pci_bus_info(fd, info)) { if (require_pci_bus_info) - return false; + return AC_QUERY_GPU_INFO_FAIL; } assert(info->drm_major == 3); @@ -545,27 +546,27 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, fprintf(stderr, "amdgpu: DRM version is %u.%u.%u, but this driver is " "only compatible with 3.42.0 (kernel 5.15+) or later.\n", info->drm_major, info->drm_minor, info->drm_patchlevel); - return false; + return AC_QUERY_GPU_INFO_FAIL; } uint64_t cap; r = drmGetCap(fd, DRM_CAP_SYNCOBJ, &cap); if (r != 0 || cap == 0) { fprintf(stderr, "amdgpu: syncobj support is missing but is required.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } /* Query hardware and driver information. */ r = ac_drm_query_gpu_info(dev, &amdinfo); if (r) { fprintf(stderr, "amdgpu: ac_drm_query_gpu_info failed.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } r = ac_drm_query_info(dev, AMDGPU_INFO_DEV_INFO, sizeof(device_info), &device_info); if (r) { fprintf(stderr, "amdgpu: ac_drm_query_info(dev_info) failed.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } for (unsigned ip_type = 0; ip_type < AMD_NUM_IP_TYPES; ip_type++) { @@ -633,35 +634,35 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, /* Only require gfx or compute. */ if (!info->ip[AMD_IP_GFX].num_queues && !info->ip[AMD_IP_COMPUTE].num_queues) { fprintf(stderr, "amdgpu: failed to find gfx or compute.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0, &info->me_fw_version, &info->me_fw_feature); if (r) { fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(me) failed.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_MEC, 0, 0, &info->mec_fw_version, &info->mec_fw_feature); if (r) { fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(mec) failed.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0, &info->pfp_fw_version, &info->pfp_fw_feature); if (r) { fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(pfp) failed.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } if (info->ip[AMD_IP_VCN_DEC].num_queues || info->ip[AMD_IP_VCN_UNIFIED].num_queues) { r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_VCN, 0, 0, &vidip_fw_version, &vidip_fw_feature); if (r) { fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(vcn) failed.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } else { info->vcn_dec_version = (vidip_fw_version & 0x0F000000) >> 24; info->vcn_enc_major_version = (vidip_fw_version & 0x00F00000) >> 20; @@ -672,7 +673,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_VCE, 0, 0, &vidip_fw_version, &vidip_fw_feature); if (r) { fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(vce) failed.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } else info->vce_fw_version = vidip_fw_version; } @@ -681,7 +682,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_UVD, 0, 0, &vidip_fw_version, &vidip_fw_feature); if (r) { fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(uvd) failed.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } else info->uvd_fw_version = vidip_fw_version; } @@ -690,7 +691,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, r = ac_drm_query_sw_info(dev, amdgpu_sw_info_address32_hi, &info->address32_hi); if (r) { fprintf(stderr, "amdgpu: amdgpu_query_sw_info(address32_hi) failed.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } struct drm_amdgpu_memory_info meminfo = {0}; @@ -698,7 +699,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, r = ac_drm_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo); if (r) { fprintf(stderr, "amdgpu: ac_drm_query_info(memory) failed.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } /* Note: usable_heap_size values can be random and can't be relied on. */ @@ -837,7 +838,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, else { fprintf(stderr, "amdgpu: Unknown gfx version: %u.%u\n", info->ip[AMD_IP_GFX].ver_major, info->ip[AMD_IP_GFX].ver_minor); - return false; + return AC_QUERY_GPU_INFO_UNIMPLEMENTED_HW; } info->family_id = device_info.family; @@ -852,7 +853,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, if (!info->name) { fprintf(stderr, "amdgpu: unknown (family_id, chip_external_rev): (%u, %u)\n", device_info.family, device_info.external_rev); - return false; + return AC_QUERY_GPU_INFO_UNIMPLEMENTED_HW; } memset(info->lowercase_name, 0, sizeof(info->lowercase_name)); @@ -1682,7 +1683,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, r = ac_drm_query_uq_fw_area_info(dev, AMDGPU_HW_IP_GFX, 0, &fw_info); if (r) { fprintf(stderr, "amdgpu: amdgpu_query_uq_fw_area_info() failed.\n"); - return false; + return AC_QUERY_GPU_INFO_FAIL; } info->has_fw_based_shadowing = true; @@ -1747,7 +1748,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, exit(0); } } - return true; + return AC_QUERY_GPU_INFO_SUCCESS; } void ac_compute_driver_uuid(char *uuid, size_t size) diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index 8056f452f33..c1744c42b50 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -328,8 +328,14 @@ struct radeon_info { bool has_image_bvh_intersect_ray; }; -bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, - bool require_pci_bus_info); +enum ac_query_gpu_info_result { + AC_QUERY_GPU_INFO_SUCCESS, + AC_QUERY_GPU_INFO_FAIL, + AC_QUERY_GPU_INFO_UNIMPLEMENTED_HW, +}; + +enum ac_query_gpu_info_result ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, + bool require_pci_bus_info); void ac_compute_driver_uuid(char *uuid, size_t size); diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c index a81a63f36c2..3db3353b6c2 100644 --- a/src/amd/vulkan/radv_physical_device.c +++ b/src/amd/vulkan/radv_physical_device.c @@ -2062,19 +2062,23 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm #ifdef _WIN32 pdev->ws = radv_null_winsys_create(); + if (!pdev->ws) + result = VK_ERROR_OUT_OF_HOST_MEMORY; #else if (drm_device) { bool reserve_vmid = instance->vk.trace_mode & RADV_TRACE_MODE_RGP; - pdev->ws = - radv_amdgpu_winsys_create(fd, instance->debug_flags, instance->perftest_flags, reserve_vmid, is_virtio); + result = radv_amdgpu_winsys_create(fd, instance->debug_flags, instance->perftest_flags, reserve_vmid, is_virtio, + &pdev->ws); } else { pdev->ws = radv_null_winsys_create(); + if (!pdev->ws) + result = VK_ERROR_OUT_OF_HOST_MEMORY; } #endif - if (!pdev->ws) { - result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, "failed to initialize winsys"); + if (result != VK_SUCCESS) { + result = vk_errorf(instance, result, "failed to initialize winsys"); goto fail_base; } diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c index dee024d4359..a2ebb948e9d 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c @@ -22,31 +22,6 @@ #include "vk_drm_syncobj.h" #include "xf86drm.h" -static bool -do_winsys_init(struct radv_amdgpu_winsys *ws, int fd) -{ - if (!ac_query_gpu_info(fd, ws->dev, &ws->info, true)) - return false; - - /* - * Override the max submits on video queues. - * If you submit multiple session contexts in the same IB sequence the - * hardware gets upset as it expects a kernel fence to be emitted to reset - * the session context in the hardware. - * Avoid this problem by never submitted more than one IB at a time. - * This possibly should be fixed in the kernel, and if it is this can be - * resolved. - */ - for (enum amd_ip_type ip_type = AMD_IP_UVD; ip_type <= AMD_IP_VCN_ENC; ip_type++) - ws->info.max_submitted_ibs[ip_type] = 1; - - ws->info.ip[AMD_IP_SDMA].num_queues = MIN2(ws->info.ip[AMD_IP_SDMA].num_queues, MAX_RINGS_PER_TYPE); - ws->info.ip[AMD_IP_COMPUTE].num_queues = MIN2(ws->info.ip[AMD_IP_COMPUTE].num_queues, MAX_RINGS_PER_TYPE); - - ws->use_ib_bos = true; - return true; -} - static void radv_amdgpu_winsys_query_info(struct radeon_winsys *rws, struct radeon_info *gpu_info) { @@ -195,9 +170,12 @@ radv_amdgpu_winsys_get_sync_types(struct radeon_winsys *rws) return ws->sync_types; } -struct radeon_winsys * -radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags, bool reserve_vmid, bool is_virtio) +VkResult +radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags, bool reserve_vmid, bool is_virtio, + struct radeon_winsys **winsys) { + VkResult result = VK_SUCCESS; + uint32_t drm_major, drm_minor, r; ac_drm_device *dev; struct radv_amdgpu_winsys *ws = NULL; @@ -205,7 +183,7 @@ radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags, r = ac_drm_device_initialize(fd, is_virtio, &drm_major, &drm_minor, &dev); if (r) { fprintf(stderr, "radv/amdgpu: failed to initialize device.\n"); - return NULL; + return VK_ERROR_INITIALIZATION_FAILED; } /* We have to keep this lock till insertion. */ @@ -214,6 +192,7 @@ radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags, winsyses = _mesa_pointer_hash_table_create(NULL); if (!winsyses) { fprintf(stderr, "radv/amdgpu: failed to alloc winsys hash table.\n"); + result = VK_ERROR_OUT_OF_HOST_MEMORY; goto fail; } @@ -232,19 +211,22 @@ radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags, ((debug_flags & RADV_DEBUG_HANG) && !ws->debug_log_bos) || ((debug_flags & RADV_DEBUG_NO_IBS) && ws->use_ib_bos) || (perftest_flags != ws->perftest)) { fprintf(stderr, "radv/amdgpu: Found options that differ from the existing winsys.\n"); - return NULL; + return VK_ERROR_INITIALIZATION_FAILED; } /* RADV_DEBUG_ZERO_VRAM is the only option that is allowed to be set again. */ if (debug_flags & RADV_DEBUG_ZERO_VRAM) ws->zero_all_vram_allocs = true; - return &ws->base; + *winsys = &ws->base; + return VK_SUCCESS; } ws = calloc(1, sizeof(struct radv_amdgpu_winsys)); - if (!ws) + if (!ws) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; goto fail; + } ws->refcount = 1; ws->dev = dev; @@ -252,8 +234,29 @@ radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags, ws->info.drm_major = drm_major; ws->info.drm_minor = drm_minor; ws->info.is_virtio = is_virtio; - if (!do_winsys_init(ws, fd)) + + enum ac_query_gpu_info_result info_result = ac_query_gpu_info(fd, ws->dev, &ws->info, true); + if (info_result != AC_QUERY_GPU_INFO_SUCCESS) { + result = info_result == AC_QUERY_GPU_INFO_FAIL ? VK_ERROR_INITIALIZATION_FAILED : VK_ERROR_INCOMPATIBLE_DRIVER; goto winsys_fail; + } + + /* + * Override the max submits on video queues. + * If you submit multiple session contexts in the same IB sequence the + * hardware gets upset as it expects a kernel fence to be emitted to reset + * the session context in the hardware. + * Avoid this problem by never submitted more than one IB at a time. + * This possibly should be fixed in the kernel, and if it is this can be + * resolved. + */ + for (enum amd_ip_type ip_type = AMD_IP_UVD; ip_type <= AMD_IP_VCN_ENC; ip_type++) + ws->info.max_submitted_ibs[ip_type] = 1; + + ws->info.ip[AMD_IP_SDMA].num_queues = MIN2(ws->info.ip[AMD_IP_SDMA].num_queues, MAX_RINGS_PER_TYPE); + ws->info.ip[AMD_IP_COMPUTE].num_queues = MIN2(ws->info.ip[AMD_IP_COMPUTE].num_queues, MAX_RINGS_PER_TYPE); + + ws->use_ib_bos = true; ws->debug_all_bos = !!(debug_flags & RADV_DEBUG_ALL_BOS); ws->debug_log_bos = debug_flags & RADV_DEBUG_HANG; @@ -265,6 +268,7 @@ radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags, r = ac_drm_vm_reserve_vmid(ws->dev, 0); if (r) { fprintf(stderr, "radv/amdgpu: failed to reserve vmid.\n"); + result = VK_ERROR_INITIALIZATION_FAILED; goto winsys_fail; } } @@ -311,7 +315,9 @@ radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags, _mesa_hash_table_insert(winsyses, (void *)ac_drm_device_get_cookie(dev), ws); simple_mtx_unlock(&winsys_creation_mutex); - return &ws->base; + *winsys = &ws->base; + + return result; winsys_fail: free(ws); @@ -322,5 +328,5 @@ fail: } simple_mtx_unlock(&winsys_creation_mutex); ac_drm_device_deinitialize(dev); - return NULL; + return result; } diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h index 9e0dc71d3bc..438d9285b65 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys_public.h @@ -12,8 +12,8 @@ #ifndef RADV_AMDGPU_WINSYS_PUBLIC_H #define RADV_AMDGPU_WINSYS_PUBLIC_H -struct radeon_winsys *radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags, - bool reserve_vmid, bool is_virtio); +VkResult radv_amdgpu_winsys_create(int fd, uint64_t debug_flags, uint64_t perftest_flags, bool reserve_vmid, + bool is_virtio, struct radeon_winsys **winsys); struct radeon_winsys *radv_dummy_winsys_create(void); diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index d825b525d2c..433ffaa0f1b 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -36,7 +36,7 @@ static bool do_winsys_init(struct amdgpu_winsys *aws, const struct pipe_screen_config *config, int fd) { - if (!ac_query_gpu_info(fd, aws->dev, &aws->info, false)) + if (ac_query_gpu_info(fd, aws->dev, &aws->info, false) != AC_QUERY_GPU_INFO_SUCCESS) goto fail; aws->addrlib = ac_addrlib_create(&aws->info, &aws->info.max_alignment);