diff --git a/src/freedreno/vulkan/meson.build b/src/freedreno/vulkan/meson.build index 298d81f5a4b..fc7d5cf13e9 100644 --- a/src/freedreno/vulkan/meson.build +++ b/src/freedreno/vulkan/meson.build @@ -39,6 +39,13 @@ libtu_files = files( 'tu_util.cc', ) +libtu_includes = [ + inc_include, + inc_src, + inc_compiler, + inc_freedreno, +] + tu_deps = [] tu_flags = [] @@ -82,6 +89,13 @@ if freedreno_kmds.contains('msm') tu_deps += dep_libdrm endif +if freedreno_kmds.contains('virtio') + tu_flags += '-DTU_HAS_VIRTIO' + libtu_files += files('tu_knl_drm_virtio.cc', 'tu_knl_drm.cc') + libtu_includes += inc_virtio_gpu + tu_deps += dep_libdrm +endif + tu_tracepoints = custom_target( 'tu_tracepoints.[ch]', input: 'tu_tracepoints.py', @@ -134,12 +148,7 @@ endif libvulkan_freedreno = shared_library( 'vulkan_freedreno', [libtu_files, tu_entrypoints, tu_tracepoints, freedreno_xml_header_files, sha1_h, u_format_pack_h], - include_directories : [ - inc_include, - inc_src, - inc_compiler, - inc_freedreno, - ], + include_directories : libtu_includes, link_with : [ libfreedreno_ir3, libfreedreno_layout, diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 1767f5ed537..3f3f88d378a 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -1579,6 +1579,7 @@ tu_queue_init(struct tu_device *device, return result; queue->device = device; + queue->priority = priority; queue->vk.driver_submit = tu_queue_submit; int ret = tu_drm_submitqueue_new(device, priority, &queue->msm_queue_id); diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index ebdc7d2d7b6..4a7422f068a 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -153,6 +153,7 @@ struct tu_queue struct tu_device *device; uint32_t msm_queue_id; + uint32_t priority; int fence; /* timestamp/fence of the last queue submission */ }; @@ -201,6 +202,9 @@ struct tu6_global volatile uint32_t breadcrumb_cpu_sync_seqno; uint32_t _pad4; + volatile uint32_t userspace_fence; + uint32_t _pad5; + /* note: larger global bo will be used for customBorderColors */ struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[]; }; @@ -224,6 +228,8 @@ enum tu_gralloc_type }; #endif +struct tu_virtio_device; + struct tu_device { struct vk_device vk; @@ -349,6 +355,10 @@ struct tu_device enum tu_gralloc_type gralloc_type; #endif +#ifdef TU_HAS_VIRTIO + struct tu_virtio_device *vdev; +#endif + uint32_t submit_count; /* Address space and global fault count for this local_fd with DRM backend */ diff --git a/src/freedreno/vulkan/tu_knl.cc b/src/freedreno/vulkan/tu_knl.cc index 9ffb3a6f9f7..365b94aa394 100644 --- a/src/freedreno/vulkan/tu_knl.cc +++ b/src/freedreno/vulkan/tu_knl.cc @@ -173,8 +173,10 @@ tu_physical_device_try_create(struct vk_instance *vk_instance, struct tu_instance *instance = container_of(vk_instance, struct tu_instance, vk); - if (!(drm_device->available_nodes & (1 << DRM_NODE_RENDER)) || - drm_device->bustype != DRM_BUS_PLATFORM) + /* Note that "msm" is a platform device, but "virtio_gpu" is a pci + * device. In general we shouldn't care about the bus type. 
+ */ + if (!(drm_device->available_nodes & (1 << DRM_NODE_RENDER))) return VK_ERROR_INCOMPATIBLE_DRIVER; const char *primary_path = drm_device->nodes[DRM_NODE_PRIMARY]; @@ -203,6 +205,10 @@ tu_physical_device_try_create(struct vk_instance *vk_instance, if (strcmp(version->name, "msm") == 0) { #ifdef TU_HAS_MSM result = tu_knl_drm_msm_load(instance, fd, version, &device); +#endif + } else if (strcmp(version->name, "virtio_gpu") == 0) { +#ifdef TU_HAS_VIRTIO + result = tu_knl_drm_virtio_load(instance, fd, version, &device); #endif } else { result = vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, diff --git a/src/freedreno/vulkan/tu_knl.h b/src/freedreno/vulkan/tu_knl.h index ae098939c55..fccfccadd2b 100644 --- a/src/freedreno/vulkan/tu_knl.h +++ b/src/freedreno/vulkan/tu_knl.h @@ -40,6 +40,14 @@ enum tu_timeline_sync_state { struct tu_bo { uint32_t gem_handle; +#ifdef TU_HAS_VIRTIO + /* Between the guest UMD and host native-ctx shim/proxy, guest kernel + * assigned res_id is used instead of host gem handle. This allows + * the guest to run ahead of the host without having to wait for + * response from the host when buffers are allocated. + */ + uint32_t res_id; +#endif uint64_t size; uint64_t iova; void *map; @@ -81,6 +89,9 @@ struct tu_knl { struct tu_zombie_vma { int fence; uint32_t gem_handle; +#ifdef TU_HAS_VIRTIO + uint32_t res_id; +#endif uint64_t iova; uint64_t size; }; @@ -105,6 +116,8 @@ static inline VkResult tu_bo_init_new(struct tu_device *dev, struct tu_bo **out_bo, uint64_t size, enum tu_bo_alloc_flags flags, const char *name) { + // TODO don't mark everything with HOST_VISIBLE !!! Anything that + // never gets CPU access should not have this bit set return tu_bo_init_new_explicit_iova( dev, out_bo, size, 0, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | @@ -143,6 +156,9 @@ struct _drmVersion; VkResult tu_knl_drm_msm_load(struct tu_instance *instance, int fd, struct _drmVersion *version, struct tu_physical_device **out); +VkResult tu_knl_drm_virtio_load(struct tu_instance *instance, + int fd, struct _drmVersion *version, + struct tu_physical_device **out); VkResult tu_enumerate_devices(struct vk_instance *vk_instance); diff --git a/src/freedreno/vulkan/tu_knl_drm_virtio.cc b/src/freedreno/vulkan/tu_knl_drm_virtio.cc new file mode 100644 index 00000000000..bc128afee68 --- /dev/null +++ b/src/freedreno/vulkan/tu_knl_drm_virtio.cc @@ -0,0 +1,1614 @@ +/* + * Copyright © 2018 Google, Inc. 
+ * Copyright © 2015 Intel Corporation + * SPDX-License-Identifier: MIT + * + * Kernel interface layer for turnip running on virtio_gpu (aka virtgpu) + */ + +#include "tu_knl.h" + +#include +#include +#include +#include +#include + +#include "vk_util.h" + +#include "drm-uapi/msm_drm.h" +#include "drm-uapi/virtgpu_drm.h" +#include "util/u_debug.h" +#include "util/hash_table.h" +#include "util/libsync.h" +#include "util/u_process.h" + +#include "tu_cmd_buffer.h" +#include "tu_cs.h" +#include "tu_device.h" +#include "tu_dynamic_rendering.h" +#include "tu_knl_drm.h" + +#define VIRGL_RENDERER_UNSTABLE_APIS 1 +#include "virglrenderer_hw.h" +#include "msm_proto.h" + +struct tu_userspace_fence_cmd { + uint32_t pkt[4]; /* first 4 dwords of packet */ + uint32_t fence; /* fifth dword is fence value which is plugged in at runtime */ + uint32_t _pad[11]; +}; + +struct tu_userspace_fence_cmds { + struct tu_userspace_fence_cmd cmds[64]; +}; + +struct tu_queue_submit { + struct vk_queue_submit *vk_submit; + struct tu_u_trace_submission_data *u_trace_submission_data; + + struct tu_cmd_buffer **cmd_buffers; + struct drm_msm_gem_submit_cmd *cmds; + struct drm_virtgpu_execbuffer_syncobj *in_syncobjs; + struct drm_virtgpu_execbuffer_syncobj *out_syncobjs; + + uint32_t nr_cmd_buffers; + uint32_t nr_in_syncobjs; + uint32_t nr_out_syncobjs; + uint32_t entry_count; + uint32_t perf_pass_index; + + bool autotune_fence; +}; + +struct tu_u_trace_syncobj { + uint32_t msm_queue_id; + uint32_t fence; +}; + +struct tu_virtio_device { + uint32_t shmem_handle; + struct msm_shmem *shmem; + uint8_t *rsp_mem; + uint32_t rsp_mem_len; + uint32_t next_rsp_off; + simple_mtx_t rsp_lock; + simple_mtx_t eb_lock; + + uint32_t next_blob_id; + uint32_t next_seqno; + + /* + * Buffering for requests to host: + */ + uint32_t reqbuf_len; + uint32_t reqbuf_cnt; + uint8_t reqbuf[0x4000]; + + struct tu_userspace_fence_cmds *fence_cmds; + struct tu_bo *fence_cmds_mem; + + /** + * Processing zombie VMAs is a two step process, first we clear the iova + * and then we close the handles. But to minimize waste of virtqueue + * space (and associated stalling and ping-ponging between guest and host) + * we want to batch up all the GEM_SET_IOVA ccmds before we flush them to + * the host and start closing handles. + * + * This gives us a place to stash the VMAs between the two steps. 
+ */ + struct u_vector zombie_vmas_stage_2; +}; + +#define virtio_ioctl(fd, name, args) ({ \ + MESA_TRACE_BEGIN(#name); \ + int ret = drmIoctl((fd), DRM_IOCTL_ ## name, (args)); \ + MESA_TRACE_END(); \ + ret; \ + }) + +static int tu_drm_get_param(struct tu_device *dev, uint32_t param, uint64_t *value); +static VkResult virtio_bo_map_handle(struct tu_device *dev, uint32_t handle, + uint32_t size, void **map); +static void tu_gem_close(struct tu_device *dev, uint32_t gem_handle); + +static int +get_capset(int fd, struct virgl_renderer_capset_drm *caps) +{ + struct drm_virtgpu_get_caps args = { + .cap_set_id = VIRGL_RENDERER_CAPSET_DRM, + .cap_set_ver = 0, + .addr = (uint64_t)(uintptr_t)caps, + .size = sizeof(*caps), + }; + + memset(caps, 0, sizeof(*caps)); + + return virtio_ioctl(fd, VIRTGPU_GET_CAPS, &args); +} + +static int +set_context(int fd) +{ + struct drm_virtgpu_context_set_param params[] = { + { VIRTGPU_CONTEXT_PARAM_CAPSET_ID, VIRGL_RENDERER_CAPSET_DRM }, + { VIRTGPU_CONTEXT_PARAM_NUM_RINGS, 64 }, + }; + struct drm_virtgpu_context_init args = { + .num_params = ARRAY_SIZE(params), + .ctx_set_params = (uint64_t)(uintptr_t)params, + }; + + return virtio_ioctl(fd, VIRTGPU_CONTEXT_INIT, &args); +} + +static int +execbuf_locked(struct tu_device *device, void *cmd, uint32_t cmd_size, + struct tu_queue_submit *submit, int *out_fence_fd, int ring_idx) +{ + struct drm_virtgpu_execbuffer eb = { + .flags = COND(out_fence_fd, VIRTGPU_EXECBUF_FENCE_FD_OUT) | + VIRTGPU_EXECBUF_RING_IDX, + .size = cmd_size, + .command = (uint64_t)(uintptr_t)cmd, + .ring_idx = ring_idx, + }; + + simple_mtx_assert_locked(&device->vdev->eb_lock); + + if (submit) { + eb.in_syncobjs = (uint64_t)(uintptr_t)submit->in_syncobjs; + eb.out_syncobjs = (uint64_t)(uintptr_t)submit->out_syncobjs; + eb.num_in_syncobjs = submit->nr_in_syncobjs; + eb.num_out_syncobjs = submit->nr_out_syncobjs; + eb.syncobj_stride = sizeof(struct drm_virtgpu_execbuffer_syncobj); + } + + int ret = virtio_ioctl(device->fd, VIRTGPU_EXECBUFFER, &eb); + if (ret) + return ret; + + if (out_fence_fd) + *out_fence_fd = eb.fence_fd; + + return 0; +} + +static int +execbuf_flush_locked(struct tu_device *device, int *out_fence_fd) +{ + struct tu_virtio_device *vdev = device->vdev; + + simple_mtx_assert_locked(&device->vdev->eb_lock); + + if (!vdev->reqbuf_len) + return 0; + + int ret = execbuf_locked(device, vdev->reqbuf, vdev->reqbuf_len, NULL, out_fence_fd, 0); + if (ret) + return ret; + + vdev->reqbuf_len = 0; + vdev->reqbuf_cnt = 0; + + return 0; +} + +static int +execbuf_flush(struct tu_device *device) +{ + struct tu_virtio_device *vdev = device->vdev; + simple_mtx_lock(&vdev->eb_lock); + int ret = execbuf_flush_locked(device, NULL); + simple_mtx_unlock(&vdev->eb_lock); + return ret; +} + +static int +send_ccmd(struct tu_device *device, struct msm_ccmd_req *req, bool sync) +{ + MESA_TRACE_FUNC(); + struct tu_virtio_device *vdev = device->vdev; + int fence_fd, ret = 0; + + simple_mtx_lock(&vdev->eb_lock); + req->seqno = ++vdev->next_seqno; + + if ((vdev->reqbuf_len + req->len) > sizeof(vdev->reqbuf)) { + ret = execbuf_flush_locked(device, NULL); + if (ret) + goto out_unlock; + } + + memcpy(&vdev->reqbuf[vdev->reqbuf_len], req, req->len); + vdev->reqbuf_len += req->len; + vdev->reqbuf_cnt++; + + if (!sync) + goto out_unlock; + + ret = execbuf_flush_locked(device, &fence_fd); + +out_unlock: + simple_mtx_unlock(&vdev->eb_lock); + + if (ret) + return ret; + + if (sync) { + MESA_TRACE_BEGIN("virtio_execbuf sync"); + sync_wait(fence_fd, -1); + close(fence_fd); + 
MESA_TRACE_END(); + } + + return 0; +} + +static void * +virtio_alloc_rsp(struct tu_device *dev, struct msm_ccmd_req *req, uint32_t sz) +{ + struct tu_virtio_device *vdev = dev->vdev; + unsigned off; + + simple_mtx_lock(&vdev->rsp_lock); + + sz = align(sz, 8); + + if ((vdev->next_rsp_off + sz) >= vdev->rsp_mem_len) + vdev->next_rsp_off = 0; + + off = vdev->next_rsp_off; + vdev->next_rsp_off += sz; + + simple_mtx_unlock(&vdev->rsp_lock); + + req->rsp_off = off; + + struct msm_ccmd_rsp *rsp = (struct msm_ccmd_rsp *)&vdev->rsp_mem[off]; + rsp->len = sz; + + return rsp; +} + +/** + * Helper for simple pass-thru ioctls + */ +static int +virtio_simple_ioctl(struct tu_device *dev, unsigned cmd, void *_req) +{ + MESA_TRACE_FUNC(); + unsigned req_len = sizeof(struct msm_ccmd_ioctl_simple_req); + unsigned rsp_len = sizeof(struct msm_ccmd_ioctl_simple_rsp); + + req_len += _IOC_SIZE(cmd); + if (cmd & IOC_OUT) + rsp_len += _IOC_SIZE(cmd); + + uint8_t buf[req_len]; + struct msm_ccmd_ioctl_simple_req *req = (struct msm_ccmd_ioctl_simple_req *)buf; + struct msm_ccmd_ioctl_simple_rsp *rsp; + + req->hdr = MSM_CCMD(IOCTL_SIMPLE, req_len); + req->cmd = cmd; + memcpy(req->payload, _req, _IOC_SIZE(cmd)); + + rsp = (struct msm_ccmd_ioctl_simple_rsp *) + virtio_alloc_rsp(dev, &req->hdr, rsp_len); + + int ret = send_ccmd(dev, &req->hdr, true); + + if (cmd & IOC_OUT) + memcpy(_req, rsp->payload, _IOC_SIZE(cmd)); + + ret = rsp->ret; + + return ret; +} + +static int +set_iova(struct tu_device *device, uint32_t res_id, uint64_t iova) +{ + struct msm_ccmd_gem_set_iova_req req = { + .hdr = MSM_CCMD(GEM_SET_IOVA, sizeof(req)), + .iova = iova, + .res_id = res_id, + }; + + return send_ccmd(device, &req.hdr, false); +} + +#define SHMEM_SZ 0x4000 + +static VkResult +init_shmem(struct tu_device *dev, struct tu_virtio_device *vdev) +{ + struct tu_instance *instance = dev->physical_device->instance; + struct drm_virtgpu_resource_create_blob args = { + .blob_mem = VIRTGPU_BLOB_MEM_HOST3D, + .blob_flags = VIRTGPU_BLOB_FLAG_USE_MAPPABLE, + .size = SHMEM_SZ, + .blob_id = 0, + }; + VkResult result; + int ret; + + ret = virtio_ioctl(dev->fd, VIRTGPU_RESOURCE_CREATE_BLOB, &args); + if (ret) { + return vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "failed to allocate shmem buffer"); + } + + vdev->shmem_handle = args.bo_handle; + + result = virtio_bo_map_handle(dev, vdev->shmem_handle, args.size, + (void **)&vdev->shmem); + if (result != VK_SUCCESS) { + tu_gem_close(dev, vdev->shmem_handle); + vdev->shmem_handle = 0; + return vk_startup_errorf(instance, result, "failed to map shmem buffer"); + } + + uint32_t offset = vdev->shmem->rsp_mem_offset; + vdev->rsp_mem_len = args.size - offset; + vdev->rsp_mem = &((uint8_t *)vdev->shmem)[offset]; + + return VK_SUCCESS; +} + +static int +query_faults(struct tu_device *dev, uint64_t *value) +{ + struct tu_virtio_device *vdev = dev->vdev; + uint32_t async_error = 0; + uint64_t global_faults; + + if (msm_shmem_has_field(vdev->shmem, async_error)) + async_error = vdev->shmem->async_error; + + if (msm_shmem_has_field(vdev->shmem, global_faults)) { + global_faults = vdev->shmem->global_faults; + } else { + int ret = tu_drm_get_param(dev, MSM_PARAM_FAULTS, &global_faults); + if (ret) + return ret; + } + + *value = global_faults + async_error; + + return 0; +} + +static void +set_debuginfo(struct tu_device *dev) +{ + const char *comm = util_get_process_name(); + static char cmdline[0x1000+1]; + int fd = open("/proc/self/cmdline", O_RDONLY); + if (fd < 0) + return; + + int n = read(fd, 
cmdline, sizeof(cmdline) - 1); + if (n < 0) + return; + + /* arguments are separated by NULL, convert to spaces: */ + for (int i = 0; i < n; i++) { + if (cmdline[i] == '\0') { + cmdline[i] = ' '; + } + } + + cmdline[n] = '\0'; + + unsigned comm_len = strlen(comm) + 1; + unsigned cmdline_len = strlen(cmdline) + 1; + + struct msm_ccmd_set_debuginfo_req *req; + + unsigned req_len = align(sizeof(*req) + comm_len + cmdline_len, 4); + + req = (struct msm_ccmd_set_debuginfo_req *)malloc(req_len); + + req->hdr = MSM_CCMD(SET_DEBUGINFO, req_len); + req->comm_len = comm_len; + req->cmdline_len = cmdline_len; + + memcpy(&req->payload[0], comm, comm_len); + memcpy(&req->payload[comm_len], cmdline, cmdline_len); + + send_ccmd(dev, &req->hdr, false); + + free(req); +} + +static VkResult +virtio_device_init(struct tu_device *dev) +{ + struct tu_instance *instance = dev->physical_device->instance; + VkResult result; + int fd; + + fd = open(dev->physical_device->fd_path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + return vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, + "failed to open device %s", dev->physical_device->fd_path); + } + + /* We don't need to get capset again, we already know we are good there + * from the initial device file open. But we still need to set the ctx + * type on the device fd that we'll _actually_ be using.. + */ + if (set_context(fd)) { + close(fd); + return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "Could not set context type: %s", strerror(errno)); + } + + struct tu_virtio_device *vdev = (struct tu_virtio_device *) + vk_zalloc(&instance->vk.alloc, sizeof(*vdev), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!vdev) { + close(fd); + return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + }; + + simple_mtx_init(&vdev->rsp_lock, mtx_plain); + simple_mtx_init(&vdev->eb_lock, mtx_plain); + + u_vector_init(&vdev->zombie_vmas_stage_2, 64, sizeof(struct tu_zombie_vma)); + + dev->vdev = vdev; + dev->fd = fd; + + result = init_shmem(dev, vdev); + if (result != VK_SUCCESS) { + vk_free(&instance->vk.alloc, vdev); + close(fd); + return result; + } + + query_faults(dev, &dev->fault_count); + + set_debuginfo(dev); + + return VK_SUCCESS; +} + +static void +virtio_device_finish(struct tu_device *dev) +{ + struct tu_instance *instance = dev->physical_device->instance; + struct tu_virtio_device *vdev = dev->vdev; + + u_vector_finish(&vdev->zombie_vmas_stage_2); + + munmap(vdev->shmem, SHMEM_SZ); + tu_gem_close(dev, vdev->shmem_handle); + + vk_free(&instance->vk.alloc, vdev); + dev->vdev = NULL; + + close(dev->fd); +} + +static int +tu_drm_get_param(struct tu_device *dev, uint32_t param, uint64_t *value) +{ + /* Technically this requires a pipe, but the kernel only supports one pipe + * anyway at the time of writing and most of these are clearly pipe + * independent. 
*/ + struct drm_msm_param req = { + .pipe = MSM_PIPE_3D0, + .param = param, + }; + + int ret = virtio_simple_ioctl(dev, DRM_IOCTL_MSM_GET_PARAM, &req); + if (ret) + return ret; + + *value = req.value; + + return 0; +} + +static int +virtio_device_get_gpu_timestamp(struct tu_device *dev, uint64_t *ts) +{ + return tu_drm_get_param(dev, MSM_PARAM_TIMESTAMP, ts); +} + +static int +virtio_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count) +{ + int ret = tu_drm_get_param(dev, MSM_PARAM_SUSPENDS, suspend_count); + return ret; +} + +static VkResult +virtio_device_check_status(struct tu_device *device) +{ + uint64_t last_fault_count = device->fault_count; + + query_faults(device, &device->fault_count); + + if (last_fault_count != device->fault_count) + return vk_device_set_lost(&device->vk, "GPU faulted or hung"); + + return VK_SUCCESS; +} + +static int +virtio_submitqueue_new(struct tu_device *dev, + int priority, + uint32_t *queue_id) +{ + assert(priority >= 0 && + priority < dev->physical_device->submitqueue_priority_count); + + struct drm_msm_submitqueue req = { + .flags = 0, + .prio = priority, + }; + + int ret = virtio_simple_ioctl(dev, DRM_IOCTL_MSM_SUBMITQUEUE_NEW, &req); + if (ret) + return ret; + + *queue_id = req.id; + return 0; +} + +static void +virtio_submitqueue_close(struct tu_device *dev, uint32_t queue_id) +{ + virtio_simple_ioctl(dev, DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE, &queue_id); +} + +static void +tu_gem_close(struct tu_device *dev, uint32_t gem_handle) +{ + struct drm_gem_close req = { + .handle = gem_handle, + }; + + /* Need to flush buffered SET_IOVA cmds before closing handles: */ + execbuf_flush(dev); + + drmIoctl(dev->fd, DRM_IOCTL_GEM_CLOSE, &req); +} + +static VkResult +tu_wait_fence(struct tu_device *dev, + uint32_t queue_id, + int fence, + uint64_t timeout_ns) +{ + if (!fence_before(dev->global_bo_map->userspace_fence, fence)) + return VK_SUCCESS; + + if (!timeout_ns) + return VK_TIMEOUT; + + MESA_TRACE_FUNC(); + + struct msm_ccmd_wait_fence_req req = { + .hdr = MSM_CCMD(WAIT_FENCE, sizeof(req)), + .queue_id = queue_id, + .fence = fence, + }; + struct msm_ccmd_submitqueue_query_rsp *rsp; + int64_t end_time = os_time_get_nano() + timeout_ns; + int ret; + + do { + rsp = (struct msm_ccmd_submitqueue_query_rsp *) + virtio_alloc_rsp(dev, &req.hdr, sizeof(*rsp)); + + ret = send_ccmd(dev, &req.hdr, true); + if (ret) + goto out; + + if (os_time_get_nano() >= end_time) + break; + + ret = rsp->ret; + } while (ret == -ETIMEDOUT); + +out: + if (!ret) return VK_SUCCESS; + if (ret == -ETIMEDOUT) return VK_TIMEOUT; + return VK_ERROR_UNKNOWN; +} + +static VkResult +tu_free_zombie_vma_locked(struct tu_device *dev, bool wait) +{ + struct tu_virtio_device *vdev = dev->vdev; + + if (!u_vector_length(&dev->zombie_vmas)) + return VK_SUCCESS; + + if (wait) { + struct tu_zombie_vma *vma = (struct tu_zombie_vma *) + u_vector_head(&dev->zombie_vmas); + /* Wait for 3s (arbitrary timeout) */ + VkResult ret = tu_wait_fence(dev, dev->queues[0]->msm_queue_id, + vma->fence, 3000000000); + + if (ret != VK_SUCCESS) + return ret; + } + + /* Clear the iova of all finished objects in first pass so the SET_IOVA + * ccmd's can be buffered and sent together to the host. *Then* delete + * the handles. This avoids filling up the virtqueue with tiny messages, + * since each execbuf ends up needing to be page aligned. 
+ */ + int last_signaled_fence = -1; + while (u_vector_length(&dev->zombie_vmas) > 0) { + struct tu_zombie_vma *vma = (struct tu_zombie_vma *) + u_vector_tail(&dev->zombie_vmas); + if (vma->fence > last_signaled_fence) { + VkResult ret = + tu_wait_fence(dev, dev->queues[0]->msm_queue_id, vma->fence, 0); + if (ret != VK_SUCCESS) + break; + + last_signaled_fence = vma->fence; + } + + set_iova(dev, vma->res_id, 0); + + u_vector_remove(&dev->zombie_vmas); + + struct tu_zombie_vma *vma2 = (struct tu_zombie_vma *) + u_vector_add(&vdev->zombie_vmas_stage_2); + + *vma2 = *vma; + } + + /* And _then_ close the GEM handles: */ + while (u_vector_length(&vdev->zombie_vmas_stage_2) > 0) { + struct tu_zombie_vma *vma = (struct tu_zombie_vma *) + u_vector_remove(&vdev->zombie_vmas_stage_2); + + util_vma_heap_free(&dev->vma, vma->iova, vma->size); + tu_gem_close(dev, vma->gem_handle); + } + + return VK_SUCCESS; +} + +static VkResult +virtio_allocate_userspace_iova(struct tu_device *dev, + uint64_t size, + uint64_t client_iova, + enum tu_bo_alloc_flags flags, + uint64_t *iova) +{ + VkResult result; + + mtx_lock(&dev->vma_mutex); + + *iova = 0; + + tu_free_zombie_vma_locked(dev, false); + + result = tu_allocate_userspace_iova(dev, size, client_iova, flags, iova); + if (result == VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS) { + /* Address may be already freed by us, but not considered as + * freed by the kernel. We have to wait until all work that + * may hold the address is done. Since addresses are meant to + * be replayed only by debug tooling, it should be ok to wait. + */ + tu_free_zombie_vma_locked(dev, true); + result = tu_allocate_userspace_iova(dev, size, client_iova, flags, iova); + } + + mtx_unlock(&dev->vma_mutex); + + return result; +} + +static VkResult +tu_bo_init(struct tu_device *dev, + struct tu_bo *bo, + uint32_t gem_handle, + uint64_t size, + uint64_t iova, + enum tu_bo_alloc_flags flags, + const char *name) +{ + assert(dev->physical_device->has_set_iova); + + set_iova(dev, bo->res_id, iova); + + name = tu_debug_bos_add(dev, size, name); + + mtx_lock(&dev->bo_mutex); + uint32_t idx = dev->bo_count++; + + /* grow the bo list if needed */ + if (idx >= dev->bo_list_size) { + uint32_t new_len = idx + 64; + struct drm_msm_gem_submit_bo *new_ptr = (struct drm_msm_gem_submit_bo *) + vk_realloc(&dev->vk.alloc, dev->bo_list, new_len * sizeof(*dev->bo_list), + 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!new_ptr) { + dev->bo_count--; + mtx_unlock(&dev->bo_mutex); + tu_gem_close(dev, gem_handle); + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + dev->bo_list = new_ptr; + dev->bo_list_size = new_len; + } + + bool dump = flags & TU_BO_ALLOC_ALLOW_DUMP; + dev->bo_list[idx] = (struct drm_msm_gem_submit_bo) { + .flags = MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE | + COND(dump, MSM_SUBMIT_BO_DUMP), + .handle = bo->res_id, + .presumed = iova, + }; + + *bo = (struct tu_bo) { + .gem_handle = gem_handle, + .res_id = bo->res_id, + .size = size, + .iova = iova, + .name = name, + .refcnt = 1, + .bo_list_idx = idx, + }; + + mtx_unlock(&dev->bo_mutex); + + return VK_SUCCESS; +} + +/** + * Sets the name in the kernel so that the contents of /debug/dri/0/gem are more + * useful. + * + * We skip this on release builds (when we're also not doing BO debugging) to + * reduce overhead. 
+ */ +static void +tu_bo_set_kernel_name(struct tu_device *dev, struct tu_bo *bo, const char *name) +{ + bool kernel_bo_names = dev->bo_sizes != NULL; +#ifdef DEBUG + kernel_bo_names = true; +#endif + if (!kernel_bo_names) + return; + + size_t sz = strlen(name); + + unsigned req_len = sizeof(struct msm_ccmd_gem_set_name_req) + align(sz, 4); + + uint8_t buf[req_len]; + struct msm_ccmd_gem_set_name_req *req = (struct msm_ccmd_gem_set_name_req *)buf; + + req->hdr = MSM_CCMD(GEM_SET_NAME, req_len); + req->res_id = bo->res_id; + req->len = sz; + + memcpy(req->payload, name, sz); + + send_ccmd(dev, &req->hdr, false); +} + +static VkResult +virtio_bo_init(struct tu_device *dev, + struct tu_bo **out_bo, + uint64_t size, + uint64_t client_iova, + VkMemoryPropertyFlags mem_property, + enum tu_bo_alloc_flags flags, + const char *name) +{ + struct tu_virtio_device *vdev = dev->vdev; + struct drm_virtgpu_resource_create_blob args = { + .blob_mem = VIRTGPU_BLOB_MEM_HOST3D, + .size = size, + }; + struct msm_ccmd_gem_new_req req = { + .hdr = MSM_CCMD(GEM_NEW, sizeof(req)), + .size = size, + }; + VkResult result; + + result = virtio_allocate_userspace_iova(dev, size, client_iova, + flags, &req.iova); + if (result != VK_SUCCESS) { + return result; + } + + if (mem_property & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) { + if (mem_property & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) { + req.flags |= MSM_BO_CACHED_COHERENT; + } else { + req.flags |= MSM_BO_CACHED; + } + } else { + req.flags |= MSM_BO_WC; + } + + if (mem_property & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + args.blob_flags |= VIRTGPU_BLOB_FLAG_USE_MAPPABLE; + } + + if (!(mem_property & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT)) { + args.blob_flags |= VIRTGPU_BLOB_FLAG_USE_CROSS_DEVICE | + VIRTGPU_BLOB_FLAG_USE_SHAREABLE; + } + + if (flags & TU_BO_ALLOC_GPU_READ_ONLY) + req.flags |= MSM_BO_GPU_READONLY; + + args.blob_id = p_atomic_inc_return(&vdev->next_blob_id); + args.cmd = (uint64_t)(intptr_t)(&req); + args.cmd_size = sizeof(req); + + /* tunneled cmds are processed separately on host side, + * before the renderer->get_blob() callback.. the blob_id + * is used to link the created bo to the get_blob() call + */ + req.blob_id = args.blob_id; + + /* RESOURCE_CREATE_BLOB has a "back-door" EXECBUF in terms of + * the tunneled cmds. To preserve the ordering, we need to + * flush any buffered ccmds and do the ioctl with eb_lock + * held. + */ + simple_mtx_lock(&vdev->eb_lock); + execbuf_flush_locked(dev, NULL); + req.hdr.seqno = ++vdev->next_seqno; + int ret = virtio_ioctl(dev->fd, VIRTGPU_RESOURCE_CREATE_BLOB, &args); + simple_mtx_unlock(&vdev->eb_lock); + + if (ret) + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + struct tu_bo* bo = tu_device_lookup_bo(dev, args.res_handle); + assert(bo && bo->gem_handle == 0); + + bo->res_id = args.res_handle; + + result = tu_bo_init(dev, bo, args.bo_handle, size, req.iova, flags, name); + if (result != VK_SUCCESS) + memset(bo, 0, sizeof(*bo)); + else + *out_bo = bo; + + /* We don't use bo->name here because for the !TU_DEBUG=bo case bo->name is NULL. */ + tu_bo_set_kernel_name(dev, bo, name); + + if (result == VK_SUCCESS && + (mem_property & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) && + !(mem_property & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { + tu_bo_map(dev, bo); + + /* Cached non-coherent memory may already have dirty cache lines, + * we should clean the cache lines before GPU got the chance to + * write into this memory. + * + * MSM already does this automatically for uncached (MSM_BO_WC) memory. 
+ */ + tu_sync_cache_bo(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU); + } + + return result; +} + +static VkResult +virtio_bo_init_dmabuf(struct tu_device *dev, + struct tu_bo **out_bo, + uint64_t size, + int prime_fd) +{ + VkResult result; + struct tu_bo* bo = NULL; + + /* lseek() to get the real size */ + off_t real_size = lseek(prime_fd, 0, SEEK_END); + lseek(prime_fd, 0, SEEK_SET); + if (real_size < 0 || (uint64_t) real_size < size) + return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); + + /* iova allocation needs to consider the object's *real* size: */ + size = real_size; + + uint64_t iova; + result = virtio_allocate_userspace_iova(dev, size, 0, TU_BO_ALLOC_NO_FLAGS, &iova); + if (result != VK_SUCCESS) + return result; + + /* Importing the same dmabuf several times would yield the same + * gem_handle. Thus there could be a race when destroying + * BO and importing the same dmabuf from different threads. + * We must not permit the creation of dmabuf BO and its release + * to happen in parallel. + */ + u_rwlock_wrlock(&dev->dma_bo_lock); + + struct drm_virtgpu_resource_info args = {}; + int ret = drmPrimeFDToHandle(dev->fd, prime_fd, + &args.bo_handle); + if (ret) { + result = vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); + goto out_unlock; + } + + ret = virtio_ioctl(dev->fd, VIRTGPU_RESOURCE_INFO, &args); + if (ret) { + result = vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE); + goto out_unlock; + } + + bo = tu_device_lookup_bo(dev, args.res_handle); + + if (bo->refcnt != 0) { + p_atomic_inc(&bo->refcnt); + *out_bo = bo; + result = VK_SUCCESS; + goto out_unlock; + } + + bo->res_id = args.res_handle; + + result = tu_bo_init(dev, bo, args.bo_handle, size, iova, + TU_BO_ALLOC_NO_FLAGS, "dmabuf"); + if (result != VK_SUCCESS) + memset(bo, 0, sizeof(*bo)); + else + *out_bo = bo; + +out_unlock: + u_rwlock_wrunlock(&dev->dma_bo_lock); + if (result != VK_SUCCESS) { + mtx_lock(&dev->vma_mutex); + util_vma_heap_free(&dev->vma, iova, size); + mtx_unlock(&dev->vma_mutex); + } + + return result; +} + +static VkResult +virtio_bo_map_handle(struct tu_device *dev, uint32_t handle, + uint32_t size, void **map) +{ + struct drm_virtgpu_map req = { + .handle = handle, + }; + int ret; + + ret = virtio_ioctl(dev->fd, VIRTGPU_MAP, &req); + if (ret) + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + *map = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, dev->fd, req.offset); + if (*map == MAP_FAILED) + return vk_error(dev, VK_ERROR_MEMORY_MAP_FAILED); + + return VK_SUCCESS; +} + +static VkResult +virtio_bo_map(struct tu_device *dev, struct tu_bo *bo) +{ + if (bo->map) + return VK_SUCCESS; + + return virtio_bo_map_handle(dev, bo->gem_handle, bo->size, &bo->map); +} + +static void +virtio_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo) +{ + mtx_lock(&dev->bo_mutex); + dev->bo_list[bo->bo_list_idx].flags |= MSM_SUBMIT_BO_DUMP; + mtx_unlock(&dev->bo_mutex); +} + +static VkResult +tu_queue_submit_create_locked(struct tu_queue *queue, + struct vk_queue_submit *vk_submit, + const uint32_t nr_in_syncobjs, + const uint32_t nr_out_syncobjs, + uint32_t perf_pass_index, + struct tu_queue_submit *new_submit) +{ + VkResult result; + + bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context); + bool has_trace_points = false; + + struct vk_command_buffer **vk_cmd_buffers = vk_submit->command_buffers; + + memset(new_submit, 0, sizeof(struct tu_queue_submit)); + + new_submit->cmd_buffers = (struct tu_cmd_buffer **) vk_cmd_buffers; + new_submit->nr_cmd_buffers = 
vk_submit->command_buffer_count; + tu_insert_dynamic_cmdbufs(queue->device, &new_submit->cmd_buffers, + &new_submit->nr_cmd_buffers); + + uint32_t entry_count = 0; + for (uint32_t j = 0; j < new_submit->nr_cmd_buffers; ++j) { + struct tu_cmd_buffer *cmdbuf = new_submit->cmd_buffers[j]; + + if (perf_pass_index != ~0) + entry_count++; + + entry_count += cmdbuf->cs.entry_count; + + if (u_trace_enabled && u_trace_has_points(&cmdbuf->trace)) { + if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) + entry_count++; + + has_trace_points = true; + } + } + + new_submit->autotune_fence = + tu_autotune_submit_requires_fence(new_submit->cmd_buffers, new_submit->nr_cmd_buffers); + if (new_submit->autotune_fence) + entry_count++; + + /* Add one for the userspace fence cmd: */ + entry_count += 1; + + new_submit->cmds = (struct drm_msm_gem_submit_cmd *) vk_zalloc( + &queue->device->vk.alloc, entry_count * sizeof(*new_submit->cmds), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (new_submit->cmds == NULL) { + result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_cmds; + } + + if (has_trace_points) { + result = + tu_u_trace_submission_data_create( + queue->device, new_submit->cmd_buffers, + new_submit->nr_cmd_buffers, + &new_submit->u_trace_submission_data); + + if (result != VK_SUCCESS) { + goto fail_u_trace_submission_data; + } + } + + /* Allocate without wait timeline semaphores */ + new_submit->in_syncobjs = (struct drm_virtgpu_execbuffer_syncobj *) vk_zalloc( + &queue->device->vk.alloc, + nr_in_syncobjs * sizeof(*new_submit->in_syncobjs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (new_submit->in_syncobjs == NULL) { + result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_in_syncobjs; + } + + /* Allocate with signal timeline semaphores considered */ + new_submit->out_syncobjs = (struct drm_virtgpu_execbuffer_syncobj *) vk_zalloc( + &queue->device->vk.alloc, + nr_out_syncobjs * sizeof(*new_submit->out_syncobjs), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (new_submit->out_syncobjs == NULL) { + result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_out_syncobjs; + } + + new_submit->entry_count = entry_count; + new_submit->nr_in_syncobjs = nr_in_syncobjs; + new_submit->nr_out_syncobjs = nr_out_syncobjs; + new_submit->perf_pass_index = perf_pass_index; + new_submit->vk_submit = vk_submit; + + return VK_SUCCESS; + +fail_out_syncobjs: + vk_free(&queue->device->vk.alloc, new_submit->in_syncobjs); +fail_in_syncobjs: + if (new_submit->u_trace_submission_data) + tu_u_trace_submission_data_finish(queue->device, + new_submit->u_trace_submission_data); +fail_u_trace_submission_data: + vk_free(&queue->device->vk.alloc, new_submit->cmds); +fail_cmds: + return result; +} + +static void +tu_queue_submit_finish(struct tu_queue *queue, struct tu_queue_submit *submit) +{ + vk_free(&queue->device->vk.alloc, submit->cmds); + vk_free(&queue->device->vk.alloc, submit->in_syncobjs); + vk_free(&queue->device->vk.alloc, submit->out_syncobjs); + if (submit->cmd_buffers != (void *) submit->vk_submit->command_buffers) + vk_free(&queue->device->vk.alloc, submit->cmd_buffers); +} + +static void +tu_fill_msm_gem_submit(struct tu_device *dev, + struct drm_msm_gem_submit_cmd *cmd, + struct tu_cs_entry *cs_entry) +{ + cmd->type = MSM_SUBMIT_CMD_BUF; + cmd->submit_idx = cs_entry->bo->bo_list_idx; + cmd->submit_offset = cs_entry->offset; + cmd->size = cs_entry->size; + cmd->pad = 0; + cmd->nr_relocs = 0; + cmd->relocs = 0; +} + +static void 
+tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue, + struct tu_queue_submit *submit, + struct tu_cs *autotune_cs) +{ + struct tu_device *dev = queue->device; + struct tu_virtio_device *vdev = dev->vdev; + struct drm_msm_gem_submit_cmd *cmds = submit->cmds; + + uint32_t entry_idx = 0; + for (uint32_t j = 0; j < submit->nr_cmd_buffers; ++j) { + struct tu_device *dev = queue->device; + struct tu_cmd_buffer *cmdbuf = submit->cmd_buffers[j]; + struct tu_cs *cs = &cmdbuf->cs; + + if (submit->perf_pass_index != ~0) { + struct tu_cs_entry *perf_cs_entry = + &dev->perfcntrs_pass_cs_entries[submit->perf_pass_index]; + + tu_fill_msm_gem_submit(dev, &cmds[entry_idx], perf_cs_entry); + entry_idx++; + } + + for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) { + tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &cs->entries[i]); + } + + if (submit->u_trace_submission_data) { + struct tu_cs *ts_cs = + submit->u_trace_submission_data->cmd_trace_data[j].timestamp_copy_cs; + if (ts_cs) { + tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &ts_cs->entries[0]); + entry_idx++; + } + } + } + + if (autotune_cs) { + assert(autotune_cs->entry_count == 1); + tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &autotune_cs->entries[0]); + entry_idx++; + } + + /* Last, add the userspace fence cmd: */ + struct tu_userspace_fence_cmds *fcmds = vdev->fence_cmds; + uint32_t fence = ++queue->fence; + int idx = fence % ARRAY_SIZE(fcmds->cmds); + + /* Wait for previous usage of fence cmd to be idle.. in practice the table + * of recycled cmds should be big enough to never stall here: + */ + tu_wait_fence(dev, dev->queues[0]->msm_queue_id, fcmds->cmds[idx].fence, 3000000000); + + fcmds->cmds[idx].fence = fence; + + cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF; + cmds[entry_idx].submit_idx = vdev->fence_cmds_mem->bo_list_idx; + cmds[entry_idx].submit_offset = ((intptr_t)&fcmds->cmds[idx]) - (intptr_t)fcmds; + cmds[entry_idx].size = 5 * 4; + cmds[entry_idx].pad = 0; + cmds[entry_idx].nr_relocs = 0; + cmds[entry_idx].relocs = 0; +} + +static VkResult +setup_fence_cmds(struct tu_device *dev) +{ + struct tu_virtio_device *vdev = dev->vdev; + VkResult result; + + result = tu_bo_init_new(dev, &vdev->fence_cmds_mem, sizeof(*vdev->fence_cmds), + (enum tu_bo_alloc_flags) + (TU_BO_ALLOC_ALLOW_DUMP | TU_BO_ALLOC_GPU_READ_ONLY), + "fence_cmds"); + if (result != VK_SUCCESS) + return result; + + result = tu_bo_map(dev, vdev->fence_cmds_mem); + if (result != VK_SUCCESS) + return result; + + vdev->fence_cmds = (struct tu_userspace_fence_cmds *)vdev->fence_cmds_mem->map; + + uint64_t fence_iova = dev->global_bo->iova + gb_offset(userspace_fence); + for (int i = 0; i < ARRAY_SIZE(vdev->fence_cmds->cmds); i++) { + struct tu_userspace_fence_cmd *c = &vdev->fence_cmds->cmds[i]; + + memset(c, 0, sizeof(*c)); + + c->pkt[0] = pm4_pkt7_hdr((uint8_t)CP_EVENT_WRITE, 4); + c->pkt[1] = CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS); + c->pkt[2] = fence_iova; + c->pkt[3] = fence_iova >> 32; + } + + return result; +} + +static VkResult +tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit) +{ + struct tu_virtio_device *vdev = queue->device->vdev; + + queue->device->submit_count++; + + /* It would be nice to not need to defer this, but virtio_device_init() + * happens before the device is initialized enough to allocate normal + * GEM buffers + */ + if (!vdev->fence_cmds) { + VkResult result = setup_fence_cmds(queue->device); + if (result != VK_SUCCESS) + return result; + } + + struct tu_cs *autotune_cs = NULL; + if (submit->autotune_fence) 
{ + autotune_cs = tu_autotune_on_submit(queue->device, + &queue->device->autotune, + submit->cmd_buffers, + submit->nr_cmd_buffers); + } + + uint32_t flags = MSM_PIPE_3D0; + + if (submit->vk_submit->wait_count) + flags |= MSM_SUBMIT_SYNCOBJ_IN; + + if (submit->vk_submit->signal_count) + flags |= MSM_SUBMIT_SYNCOBJ_OUT; + + mtx_lock(&queue->device->bo_mutex); + + if (queue->device->implicit_sync_bo_count == 0) + flags |= MSM_SUBMIT_NO_IMPLICIT; + + /* drm_msm_gem_submit_cmd requires index of bo which could change at any + * time when bo_mutex is not locked. So we build submit cmds here the real + * place to submit. + */ + tu_queue_build_msm_gem_submit_cmds(queue, submit, autotune_cs); + + /* TODO avoid extra memcpy, and populate bo's and cmds directly + * into the req msg + */ + unsigned nr_cmds = submit->entry_count; + unsigned nr_bos = nr_cmds ? queue->device->bo_count : 0; + unsigned bos_len = nr_bos * sizeof(struct drm_msm_gem_submit_bo); + unsigned cmd_len = nr_cmds * sizeof(struct drm_msm_gem_submit_cmd); + unsigned req_len = sizeof(struct msm_ccmd_gem_submit_req) + bos_len + cmd_len; + struct msm_ccmd_gem_submit_req *req = (struct msm_ccmd_gem_submit_req *)vk_alloc( + &queue->device->vk.alloc, req_len, 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (!req) { + mtx_unlock(&queue->device->bo_mutex); + return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + req->hdr = MSM_CCMD(GEM_SUBMIT, req_len); + req->flags = flags; + req->queue_id = queue->msm_queue_id; + req->nr_bos = nr_bos; + req->nr_cmds = nr_cmds; + + /* Use same kernel fence and userspace fence seqno to avoid having + * to track both: + */ + req->fence = queue->fence; + + memcpy(req->payload, queue->device->bo_list, bos_len); + memcpy(req->payload + bos_len, submit->cmds, cmd_len); + + int ring_idx = queue->priority + 1; + int ret; + + simple_mtx_lock(&vdev->eb_lock); + ret = execbuf_flush_locked(queue->device, NULL) || + execbuf_locked(queue->device, &req->hdr, req->hdr.len, + submit, NULL, ring_idx); + simple_mtx_unlock(&vdev->eb_lock); + + mtx_unlock(&queue->device->bo_mutex); + + tu_debug_bos_print_stats(queue->device); + + if (ret) + return vk_device_set_lost(&queue->device->vk, "submit failed: %m"); + +#if HAVE_PERFETTO + tu_perfetto_submit(queue->device, queue->device->submit_count); +#endif + + if (submit->u_trace_submission_data) { + struct tu_u_trace_submission_data *submission_data = + submit->u_trace_submission_data; + submission_data->submission_id = queue->device->submit_count; + /* We have to allocate it here since it is different between drm/kgsl */ + submission_data->syncobj = (struct tu_u_trace_syncobj *) + vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj), + 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + submission_data->syncobj->fence = req->fence; + submission_data->syncobj->msm_queue_id = queue->msm_queue_id; + + submit->u_trace_submission_data = NULL; + + for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) { + bool free_data = i == submission_data->last_buffer_with_tracepoints; + if (submission_data->cmd_trace_data[i].trace) + u_trace_flush(submission_data->cmd_trace_data[i].trace, + submission_data, free_data); + + if (!submission_data->cmd_trace_data[i].timestamp_copy_cs) { + /* u_trace is owned by cmd_buffer */ + submission_data->cmd_trace_data[i].trace = NULL; + } + } + } + + for (uint32_t i = 0; i < submit->vk_submit->wait_count; i++) { + if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->waits[i].sync)) + continue; + + struct tu_timeline_sync *sync = + 
container_of(submit->vk_submit->waits[i].sync, struct tu_timeline_sync, base); + + assert(sync->state != TU_TIMELINE_SYNC_STATE_RESET); + + /* Set SIGNALED to the state of the wait timeline sync since this means the syncobj + * is done and ready again so this can be garbage-collectioned later. + */ + sync->state = TU_TIMELINE_SYNC_STATE_SIGNALED; + } + + for (uint32_t i = 0; i < submit->vk_submit->signal_count; i++) { + if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->signals[i].sync)) + continue; + + struct tu_timeline_sync *sync = + container_of(submit->vk_submit->signals[i].sync, struct tu_timeline_sync, base); + + assert(sync->state == TU_TIMELINE_SYNC_STATE_RESET); + /* Set SUBMITTED to the state of the signal timeline sync so we could wait for + * this timeline sync until completed if necessary. + */ + sync->state = TU_TIMELINE_SYNC_STATE_SUBMITTED; + } + + pthread_cond_broadcast(&queue->device->timeline_cond); + + return VK_SUCCESS; +} + +static VkResult +virtio_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj) +{ + return tu_wait_fence(dev, syncobj->msm_queue_id, syncobj->fence, 1000000000); +} + +static VkResult +virtio_queue_submit(struct tu_queue *queue, struct vk_queue_submit *submit) +{ + MESA_TRACE_FUNC(); + uint32_t perf_pass_index = queue->device->perfcntrs_pass_cs ? + submit->perf_pass_index : ~0; + struct tu_queue_submit submit_req; + + if (TU_DEBUG(LOG_SKIP_GMEM_OPS)) { + tu_dbg_log_gmem_load_store_skips(queue->device); + } + + pthread_mutex_lock(&queue->device->submit_mutex); + + VkResult ret = tu_queue_submit_create_locked(queue, submit, + submit->wait_count, submit->signal_count, + perf_pass_index, &submit_req); + + if (ret != VK_SUCCESS) { + pthread_mutex_unlock(&queue->device->submit_mutex); + return ret; + } + + /* note: assuming there won't be any very large semaphore counts */ + struct drm_virtgpu_execbuffer_syncobj *in_syncobjs = submit_req.in_syncobjs; + struct drm_virtgpu_execbuffer_syncobj *out_syncobjs = submit_req.out_syncobjs; + + uint32_t nr_in_syncobjs = 0, nr_out_syncobjs = 0; + + for (uint32_t i = 0; i < submit->wait_count; i++) { + struct vk_sync *sync = submit->waits[i].sync; + + in_syncobjs[nr_in_syncobjs++] = (struct drm_virtgpu_execbuffer_syncobj) { + .handle = tu_syncobj_from_vk_sync(sync), + .flags = 0, + }; + } + + for (uint32_t i = 0; i < submit->signal_count; i++) { + struct vk_sync *sync = submit->signals[i].sync; + + out_syncobjs[nr_out_syncobjs++] = (struct drm_virtgpu_execbuffer_syncobj) { + .handle = tu_syncobj_from_vk_sync(sync), + .flags = 0, + }; + } + + ret = tu_queue_submit_locked(queue, &submit_req); + + pthread_mutex_unlock(&queue->device->submit_mutex); + tu_queue_submit_finish(queue, &submit_req); + + if (ret != VK_SUCCESS) + return ret; + + u_trace_context_process(&queue->device->trace_context, true); + + return VK_SUCCESS; +} + +static const struct tu_knl virtio_knl_funcs = { + .name = "virtgpu", + + .device_init = virtio_device_init, + .device_finish = virtio_device_finish, + .device_get_gpu_timestamp = virtio_device_get_gpu_timestamp, + .device_get_suspend_count = virtio_device_get_suspend_count, + .device_check_status = virtio_device_check_status, + .submitqueue_new = virtio_submitqueue_new, + .submitqueue_close = virtio_submitqueue_close, + .bo_init = virtio_bo_init, + .bo_init_dmabuf = virtio_bo_init_dmabuf, + .bo_export_dmabuf = tu_drm_export_dmabuf, + .bo_map = virtio_bo_map, + .bo_allow_dump = virtio_bo_allow_dump, + .bo_finish = tu_drm_bo_finish, + .device_wait_u_trace = 
virtio_device_wait_u_trace, + .queue_submit = virtio_queue_submit, +}; + +VkResult +tu_knl_drm_virtio_load(struct tu_instance *instance, + int fd, struct _drmVersion *version, + struct tu_physical_device **out) +{ + struct virgl_renderer_capset_drm caps; + VkResult result = VK_SUCCESS; + uint64_t val; + + /* Debug option to force fallback to venus: */ + if (debug_get_bool_option("TU_NO_VIRTIO", false)) + return VK_ERROR_INCOMPATIBLE_DRIVER; + + if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &val) || !val) { + return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "kernel driver for device %s does not support DRM_CAP_SYNC_OBJ", + version->name); + } + + if (get_capset(fd, &caps)) { + return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "could not get caps: %s", strerror(errno)); + } + + if (caps.context_type != VIRTGPU_DRM_CONTEXT_MSM) { + return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "wrong context_type: %u", caps.context_type); + } + + mesa_logd("wire_format_version: %u", caps.wire_format_version); + mesa_logd("version_major: %u", caps.version_major); + mesa_logd("version_minor: %u", caps.version_minor); + mesa_logd("version_patchlevel: %u", caps.version_patchlevel); + mesa_logd("has_cached_coherent: %u", caps.u.msm.has_cached_coherent); + mesa_logd("va_start: 0x%0" PRIx64, caps.u.msm.va_start); + mesa_logd("va_size: 0x%0" PRIx64, caps.u.msm.va_size); + mesa_logd("gpu_id: %u", caps.u.msm.gpu_id); + mesa_logd("gmem_size: %u", caps.u.msm.gmem_size); + mesa_logd("gmem_base: 0x%0" PRIx64, caps.u.msm.gmem_base); + mesa_logd("chip_id: 0x%0" PRIx64, caps.u.msm.chip_id); + mesa_logd("max_freq: %u", caps.u.msm.max_freq); + + if (caps.wire_format_version != 2) { + return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "Unsupported protocol version: %u", + caps.wire_format_version); + } + + if ((caps.version_major != 1) || (caps.version_minor < 9)) { + return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "unsupported version: %u.%u.%u", + caps.version_major, + caps.version_minor, + caps.version_patchlevel); + } + + if (!caps.u.msm.va_size) { + return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "No address space"); + } + + if (set_context(fd)) { + return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "Could not set context type: %s", strerror(errno)); + } + + struct tu_physical_device *device = (struct tu_physical_device *) + vk_zalloc(&instance->vk.alloc, sizeof(*device), 8, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + if (!device) { + result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail; + } + + device->msm_major_version = caps.version_major; + device->msm_minor_version = caps.version_minor; + + device->instance = instance; + device->local_fd = fd; + + device->dev_id.gpu_id = caps.u.msm.gpu_id; + device->dev_id.chip_id = caps.u.msm.chip_id; + device->gmem_size = caps.u.msm.gmem_size; + device->gmem_base = caps.u.msm.gmem_base; + device->va_start = caps.u.msm.va_start; + device->va_size = caps.u.msm.va_size; + device->has_set_iova = true; + + device->gmem_size = debug_get_num_option("TU_GMEM", device->gmem_size); + + device->has_cached_coherent_memory = caps.u.msm.has_cached_coherent; +#ifdef _SC_LEVEL1_DCACHE_LINESIZE + if (DETECT_ARCH_AARCH64 || DETECT_ARCH_X86 || DETECT_ARCH_X86_64) { + long l1_dcache = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + device->has_cached_non_coherent_memory = l1_dcache > 0; + device->level1_dcache_size = l1_dcache; + } +#endif + + device->submitqueue_priority_count = 
caps.u.msm.priorities;
+
+   device->syncobj_type = vk_drm_syncobj_get_type(fd);
+   /* we don't support DRM_CAP_SYNCOBJ_TIMELINE, but drm-shim does */
+   if (!(device->syncobj_type.features & VK_SYNC_FEATURE_TIMELINE))
+      device->timeline_type = vk_sync_timeline_get_type(&tu_timeline_sync_type);
+
+   device->sync_types[0] = &device->syncobj_type;
+   device->sync_types[1] = &device->timeline_type.sync;
+   device->sync_types[2] = NULL;
+
+   device->heap.size = tu_get_system_heap_size();
+   device->heap.used = 0u;
+   device->heap.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT;
+
+   instance->knl = &virtio_knl_funcs;
+
+   *out = device;
+
+   return VK_SUCCESS;
+
+fail:
+   vk_free(&instance->vk.alloc, device);
+   return result;
+}
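
The sketches that follow are illustrative asides on the mechanisms this patch introduces; they are not part of the diff, and any example_* names are hypothetical. First, the probe path: tu_knl_drm_virtio_load() only accepts a virtgpu render node whose host exposes the MSM native context via the DRM capset, then binds the fd to that capset and asks for multiple rings. A minimal sketch of that sequence, assuming the same libdrm/virtgpu/virglrenderer headers the patch itself uses, mirroring get_capset()/set_context() above:

/* Illustrative sketch only; not part of the patch. */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include "drm-uapi/virtgpu_drm.h"
#define VIRGL_RENDERER_UNSTABLE_APIS 1
#include "virglrenderer_hw.h"

static bool
example_probe_virtgpu_msm(int fd)
{
   /* 1) Ask the host which native-context protocol it speaks: */
   struct virgl_renderer_capset_drm caps;
   memset(&caps, 0, sizeof(caps));

   struct drm_virtgpu_get_caps get_caps = {
      .cap_set_id = VIRGL_RENDERER_CAPSET_DRM,
      .cap_set_ver = 0,
      .addr = (uint64_t)(uintptr_t)&caps,
      .size = sizeof(caps),
   };
   if (drmIoctl(fd, DRM_IOCTL_VIRTGPU_GET_CAPS, &get_caps))
      return false;

   /* 2) Only the MSM context type is usable by turnip: */
   if (caps.context_type != VIRTGPU_DRM_CONTEXT_MSM)
      return false;

   /* 3) Bind this fd to the DRM capset and request multiple rings:
    *    ring 0 carries the buffered ccmds, rings 1..N map to queue
    *    priorities (ring_idx = queue->priority + 1 at submit time). */
   struct drm_virtgpu_context_set_param params[] = {
      { VIRTGPU_CONTEXT_PARAM_CAPSET_ID, VIRGL_RENDERER_CAPSET_DRM },
      { VIRTGPU_CONTEXT_PARAM_NUM_RINGS, 64 },
   };
   struct drm_virtgpu_context_init init = {
      .num_params = 2,
      .ctx_set_params = (uint64_t)(uintptr_t)params,
   };
   return drmIoctl(fd, DRM_IOCTL_VIRTGPU_CONTEXT_INIT, &init) == 0;
}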
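Most host requests are tiny, so send_ccmd() batches them: each ccmd carries its total length and a monotonically increasing seqno, is appended to a 16 KiB request buffer, and the buffer is flushed with a single execbuffer on ring 0 either when it would overflow or when the caller needs a synchronous response. A simplified standalone sketch of that append-or-flush bookkeeping (these structs and names are stand-ins, not the patch's msm_proto.h types):

/* Illustrative sketch only; hypothetical stand-ins for struct msm_ccmd_req
 * and the reqbuf fields of struct tu_virtio_device. */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct example_ccmd_hdr {
   uint32_t cmd;    /* which host-side handler should run */
   uint32_t len;    /* total length of the request, 8-byte aligned */
   uint32_t seqno;  /* assigned at enqueue time, monotonically increasing */
};

struct example_reqbuf {
   uint32_t len;        /* bytes currently buffered */
   uint32_t next_seqno;
   uint8_t  buf[0x4000];
};

/* Append one request (header plus its contiguous payload) to the buffer.
 * Returns false if it does not fit; the caller then flushes the buffer with
 * a single VIRTGPU_EXECBUFFER on ring 0 and retries the append. */
static bool
example_enqueue_ccmd(struct example_reqbuf *rb, struct example_ccmd_hdr *req)
{
   if (rb->len + req->len > sizeof(rb->buf))
      return false;

   req->seqno = ++rb->next_seqno;
   memcpy(&rb->buf[rb->len], req, req->len);
   rb->len += req->len;
   return true;
}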
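BO allocation is "tunneled": virtio_bo_init() embeds a GEM_NEW ccmd in the RESOURCE_CREATE_BLOB ioctl, and the shared blob_id is what lets the host-side get_blob() callback pair the blob with the GEM object that ccmd created. The guest then refers to the buffer by the kernel-assigned res_id in later ccmds without waiting for a host round trip. A reduced sketch (same msm_proto.h/virtgpu headers as the patch, plus <errno.h>; memory-type flags, seqno assignment and the eb_lock/flush ordering from virtio_bo_init() are elided):

/* Illustrative sketch only; not part of the patch. */
static int
example_gem_new(int fd, uint64_t size, uint64_t iova, uint32_t blob_id,
                uint32_t *res_id, uint32_t *bo_handle)
{
   /* Tunneled command: the host runs this before its get_blob() callback,
    * and blob_id is how the two are matched up on the host side. */
   struct msm_ccmd_gem_new_req req = {
      .hdr = MSM_CCMD(GEM_NEW, sizeof(req)),
      .size = size,
      .iova = iova,
      .blob_id = blob_id,
   };

   struct drm_virtgpu_resource_create_blob args = {
      .blob_mem = VIRTGPU_BLOB_MEM_HOST3D,
      .blob_flags = VIRTGPU_BLOB_FLAG_USE_MAPPABLE, /* host-visible case */
      .size = size,
      .blob_id = blob_id,
      .cmd = (uint64_t)(uintptr_t)&req,
      .cmd_size = sizeof(req),
   };

   if (drmIoctl(fd, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB, &args))
      return -errno;

   *bo_handle = args.bo_handle;   /* guest-local GEM handle */
   *res_id = args.res_handle;     /* what later ccmds (SET_IOVA, submits) use */
   return 0;
}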
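GEM submits do not go through the request buffer: each one is its own VIRTGPU_EXECBUFFER on the ring that corresponds to the queue (ring 0 stays reserved for buffered ccmds), with the guest-side drm syncobjs attached directly to that execbuffer, as in execbuf_locked() above. A trimmed sketch with a hypothetical wrapper name (same headers as the probe sketch, plus <errno.h>; the optional out-fence fd is omitted):

/* Illustrative sketch only; not part of the patch. */
static int
example_submit_on_ring(int fd, void *ccmd, uint32_t ccmd_size, uint32_t ring_idx,
                       struct drm_virtgpu_execbuffer_syncobj *waits, uint32_t n_waits,
                       struct drm_virtgpu_execbuffer_syncobj *signals, uint32_t n_signals)
{
   struct drm_virtgpu_execbuffer eb = {
      .flags = VIRTGPU_EXECBUF_RING_IDX,
      .size = ccmd_size,
      .command = (uint64_t)(uintptr_t)ccmd,
      .ring_idx = ring_idx,   /* queue->priority + 1; ring 0 is the ccmd ring */
      .in_syncobjs = (uint64_t)(uintptr_t)waits,
      .out_syncobjs = (uint64_t)(uintptr_t)signals,
      .num_in_syncobjs = n_waits,
      .num_out_syncobjs = n_signals,
      .syncobj_stride = sizeof(struct drm_virtgpu_execbuffer_syncobj),
   };

   return drmIoctl(fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &eb) ? -errno : 0;
}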
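Every submit also gets one extra command stream appended that makes the CP write the submit's fence value to the userspace_fence word added to struct tu6_global. Those 5-dword CP_EVENT_WRITE packets are recycled from the 64-entry tu_userspace_fence_cmds table (16 dwords per slot, only the first 5 executed, hence cmds[entry_idx].size = 5 * 4 bytes in tu_queue_build_msm_gem_submit_cmds()). A sketch of the slot-recycling arithmetic with hypothetical example_* names:

/* Illustrative sketch only; not part of the patch. */
#include <stdint.h>

#define EXAMPLE_NUM_FENCE_SLOTS 64   /* matches tu_userspace_fence_cmds::cmds */

struct example_fence_cmd {
   uint32_t pkt[4];    /* CP_EVENT_WRITE header, event, fence iova lo, iova hi */
   uint32_t fence;     /* fifth dword: the value the CP writes out */
   uint32_t _pad[11];  /* pad each slot to 64 bytes */
};

struct example_fence_cmds {
   struct example_fence_cmd cmds[EXAMPLE_NUM_FENCE_SLOTS];
};

/* Pick the slot for a new submit and compute the drm_msm_gem_submit_cmd
 * fields that will point the CP at it.  Before reuse the driver waits on
 * cmds[idx].fence, i.e. the submit that used this slot 64 fences ago. */
static void
example_pick_fence_slot(struct example_fence_cmds *fcmds, uint32_t fence,
                        uint32_t *submit_offset, uint32_t *size)
{
   uint32_t idx = fence % EXAMPLE_NUM_FENCE_SLOTS;

   fcmds->cmds[idx].fence = fence;

   /* cmds[] sits at offset 0 of the fence_cmds BO: */
   *submit_offset = idx * (uint32_t)sizeof(struct example_fence_cmd);
   *size = (uint32_t)(5 * sizeof(uint32_t));   /* only the first 5 dwords run */
}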
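On the wait side, tu_wait_fence() first polls that userspace_fence word and only falls back to a WAIT_FENCE ccmd when the value has not caught up; the comparison must tolerate 32-bit seqno wrap, which is what the fence_before() helper used above provides. A standalone sketch of that idiom:

/* Illustrative sketch, in the spirit of the fence_before() helper used by
 * tu_wait_fence() above; not part of the patch. */
#include <stdbool.h>
#include <stdint.h>

static inline bool
example_fence_before(uint32_t a, uint32_t b)
{
   /* True when 'a' was issued before 'b', even across 32-bit wrap-around,
    * as long as the two values are less than 2^31 apart. */
   return (int32_t)(a - b) < 0;
}

/* Usage: each submit bumps queue->fence and has the GPU write that value to
 * the global userspace_fence word; a wait for fence F completes once
 * !example_fence_before(userspace_fence, F). */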