From 8d0e6c02c7024fd3a1e8f60bde8dd4428943bd46 Mon Sep 17 00:00:00 2001 From: Friedrich Vock Date: Sat, 16 Jul 2022 23:52:45 +0200 Subject: [PATCH] radv: Add RMV tracing utilities Part-of: --- src/amd/vulkan/meson.build | 1 + src/amd/vulkan/radv_private.h | 54 ++ src/amd/vulkan/radv_rmv.c | 939 ++++++++++++++++++++++++++++++++++ 3 files changed, 994 insertions(+) create mode 100644 src/amd/vulkan/radv_rmv.c diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build index 59a8562102d..5e10f831424 100644 --- a/src/amd/vulkan/meson.build +++ b/src/amd/vulkan/meson.build @@ -95,6 +95,7 @@ libradv_files = files( 'radv_pipeline_rt.c', 'radv_private.h', 'radv_radeon_winsys.h', + 'radv_rmv.c', 'radv_rra.c', 'radv_rt_common.c', 'radv_rt_shader.c', diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 40a2cda8d42..e03c65e4c64 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -68,6 +68,9 @@ #include "vk_util.h" #include "vk_image.h" +#include "rmv/vk_rmv_common.h" +#include "rmv/vk_rmv_tokens.h" + #include "ac_binary.h" #include "ac_gpu_info.h" #include "ac_shader_util.h" @@ -802,6 +805,14 @@ struct radv_notifier { thrd_t thread; }; +struct radv_memory_trace_data { + /* ID of the PTE update event in ftrace data */ + uint16_t ftrace_update_ptes_id; + + uint32_t num_cpus; + int *pipe_fds; +}; + struct radv_rra_accel_struct_data { VkEvent build_event; uint64_t va; @@ -927,6 +938,9 @@ struct radv_device { /* Thread trace. */ struct ac_thread_trace_data thread_trace; + /* Memory trace. */ + struct radv_memory_trace_data memory_trace; + /* SPM. */ struct ac_spm_trace_data spm_trace; @@ -2830,6 +2844,46 @@ void radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data) bool radv_sdma_copy_image(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, struct radv_buffer *buffer, const VkBufferImageCopy2 *region); +void radv_memory_trace_init(struct radv_device *device); +void radv_rmv_log_bo_allocate(struct radv_device *device, struct radeon_winsys_bo *bo, + uint32_t size, bool is_internal); +void radv_rmv_log_bo_destroy(struct radv_device *device, struct radeon_winsys_bo *bo); +void radv_rmv_log_heap_create(struct radv_device *device, VkDeviceMemory heap, bool is_internal, + VkMemoryAllocateFlags alloc_flags); +void radv_rmv_log_buffer_bind(struct radv_device *device, VkBuffer _buffer); +void radv_rmv_log_image_create(struct radv_device *device, const VkImageCreateInfo *create_info, + bool is_internal, VkImage _image); +void radv_rmv_log_image_bind(struct radv_device *device, VkImage _image); +void radv_rmv_log_query_pool_create(struct radv_device *device, VkQueryPool pool, bool is_internal); +void radv_rmv_log_command_buffer_bo_create(struct radv_device *device, struct radeon_winsys_bo *bo, + uint32_t executable_size, uint32_t data_size, + uint32_t scratch_size); +void radv_rmv_log_command_buffer_bo_destroy(struct radv_device *device, + struct radeon_winsys_bo *bo); +void radv_rmv_log_border_color_palette_create(struct radv_device *device, + struct radeon_winsys_bo *bo); +void radv_rmv_log_border_color_palette_destroy(struct radv_device *device, + struct radeon_winsys_bo *bo); +void radv_rmv_log_sparse_add_residency(struct radv_device *device, struct radeon_winsys_bo *src_bo, + uint64_t offset); +void radv_rmv_log_sparse_remove_residency(struct radv_device *device, + struct radeon_winsys_bo *src_bo, uint64_t offset); +void radv_rmv_log_descriptor_pool_create(struct radv_device *device, + const 
VkDescriptorPoolCreateInfo *create_info, + VkDescriptorPool pool, bool is_internal); +void radv_rmv_log_graphics_pipeline_create(struct radv_device *device, VkPipelineCreateFlags flags, + struct radv_pipeline *pipeline, bool is_internal); +void radv_rmv_log_compute_pipeline_create(struct radv_device *device, VkPipelineCreateFlags flags, + struct radv_pipeline *pipeline, bool is_internal); +void radv_rmv_log_event_create(struct radv_device *device, VkEvent event, VkEventCreateFlags flags, + bool is_internal); +void radv_rmv_log_resource_destroy(struct radv_device *device, uint64_t handle); +void radv_rmv_log_submit(struct radv_device *device, enum amd_ip_type type); +void radv_rmv_fill_device_info(struct radv_physical_device *device, + struct vk_rmv_device_info *info); +void radv_rmv_collect_trace_events(struct radv_device *device); +void radv_memory_trace_finish(struct radv_device *device); + VkResult radv_create_buffer(struct radv_device *device, const VkBufferCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, VkBuffer *pBuffer, bool is_internal); diff --git a/src/amd/vulkan/radv_rmv.c b/src/amd/vulkan/radv_rmv.c new file mode 100644 index 00000000000..bee9038be0c --- /dev/null +++ b/src/amd/vulkan/radv_rmv.c @@ -0,0 +1,939 @@ +/* + * Copyright © 2022 Friedrich Vock + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#ifndef _WIN32
+#include <dirent.h>
+#include <unistd.h>
+#endif
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "ac_gpu_info.h"
+#include "radv_private.h"
+
+#define RADV_FTRACE_INSTANCE_PATH "/sys/kernel/tracing/instances/amd_rmv"
+
+static FILE *
+open_event_file(const char *event_name, const char *event_filename, const char *mode)
+{
+   char filename[2048];
+   snprintf(filename, sizeof(filename), RADV_FTRACE_INSTANCE_PATH "/events/amdgpu/%s/%s",
+            event_name, event_filename);
+   return fopen(filename, mode);
+}
+
+static bool
+set_event_tracing_enabled(const char *event_name, bool enabled)
+{
+   FILE *file = open_event_file(event_name, "enable", "w");
+   if (!file)
+      return false;
+
+   size_t written_bytes = fwrite(enabled ? "1" : "0", 1, 1, file);
+   fclose(file);
+   return written_bytes == 1;
+}
+
+static uint16_t
+trace_event_id(const char *event_name)
+{
+   /* id is 16-bit, so <= 65535 */
+   char data[6];
+
+   FILE *file = open_event_file(event_name, "id", "r");
+   if (!file)
+      return (uint16_t)~0;
+
+   size_t read_bytes = fread(data, 1, 6, file);
+   fclose(file);
+
+   if (!read_bytes)
+      return (uint16_t)~0;
+
+   return (uint16_t)strtoul(data, NULL, 10);
+}
+
+static void
+open_trace_pipe(uint32_t cpu_index, int *dst_fd)
+{
+#ifdef _WIN32
+   *dst_fd = -1;
+#else
+   char filename[2048];
+   snprintf(filename, sizeof(filename), RADV_FTRACE_INSTANCE_PATH "/per_cpu/cpu%d/trace_pipe_raw",
+            cpu_index);
+   /* I/O to the pipe needs to be non-blocking, otherwise reading all available
+    * data would block indefinitely by waiting for more data to be written to the pipe */
+   *dst_fd = open(filename, O_RDONLY | O_NONBLOCK);
+#endif
+}
+
+/*
+ * Kernel trace buffer parsing
+ */
+
+struct trace_page_header {
+   uint64_t timestamp;
+   int32_t commit;
+};
+
+enum trace_event_type {
+   TRACE_EVENT_TYPE_PADDING = 29,
+   TRACE_EVENT_TYPE_EXTENDED_DELTA,
+   TRACE_EVENT_TYPE_TIMESTAMP
+};
+
+struct trace_event_header {
+   uint32_t type_len : 5;
+   uint32_t time_delta : 27;
+   /* Only present if length is too big for type_len */
+   uint32_t excess_length;
+};
+
+struct trace_event_common {
+   unsigned short type;
+   unsigned char flags;
+   unsigned char preempt_count;
+   int pid;
+};
+
+struct trace_event_amdgpu_vm_update_ptes {
+   struct trace_event_common common;
+   uint64_t start;
+   uint64_t end;
+   uint64_t flags;
+   unsigned int num_ptes;
+   uint64_t incr;
+   int pid;
+   uint64_t vm_ctx;
+};
+
+/* Represents a dynamic array of addresses in the ftrace buffer. */
+struct trace_event_address_array {
+   uint16_t data_size;
+   uint16_t reserved;
+   char data[];
+};
+
+/* Possible flags for PTEs, taken from amdgpu_vm.h */
+#define AMDGPU_PTE_VALID (1ULL << 0)
+#define AMDGPU_PTE_SYSTEM (1ULL << 1)
+#define AMDGPU_PTE_PRT (1ULL << 51)
+
+/* The minimum size of a GPU page */
+#define MIN_GPU_PAGE_SIZE 4096
+
+static void
+emit_page_table_update_event(struct vk_memory_trace_data *data, bool is_apu, uint64_t timestamp,
+                             struct trace_event_amdgpu_vm_update_ptes *event, uint64_t *addrs,
+                             unsigned int pte_index)
+{
+   struct vk_rmv_token token;
+
+   uint64_t end_addr;
+   /* There may be more updated PTEs than the ones reported in the ftrace buffer.
+    * We choose the reported end virtual address here to report the correct total committed memory.
*/ + if (pte_index == event->num_ptes - 1) + end_addr = event->end; + else + end_addr = event->start + (pte_index + 1) * (event->incr / MIN_GPU_PAGE_SIZE); + uint64_t start_addr = event->start + pte_index * (event->incr / MIN_GPU_PAGE_SIZE); + + token.type = VK_RMV_TOKEN_TYPE_PAGE_TABLE_UPDATE; + token.timestamp = timestamp; + token.data.page_table_update.type = VK_RMV_PAGE_TABLE_UPDATE_TYPE_UPDATE; + token.data.page_table_update.page_size = event->incr; + token.data.page_table_update.page_count = + (end_addr - start_addr) * MIN_GPU_PAGE_SIZE / event->incr; + token.data.page_table_update.pid = event->common.pid; + token.data.page_table_update.virtual_address = + event->start * MIN_GPU_PAGE_SIZE + pte_index * event->incr; + /* RMV expects mappings to system memory to have a physical address of 0. + * Even with traces generated by AMDGPU-PRO, on APUs without dedicated VRAM everything seems to + * be marked as "committed to system memory". */ + token.data.page_table_update.physical_address = + event->flags & AMDGPU_PTE_SYSTEM || is_apu ? 0 : addrs[pte_index]; + + token.data.page_table_update.is_unmap = !(event->flags & (AMDGPU_PTE_VALID | AMDGPU_PTE_PRT)); + util_dynarray_append(&data->tokens, struct vk_rmv_token, token); +} + +static void +evaluate_trace_event(struct radv_device *device, uint64_t timestamp, struct util_dynarray *tokens, + struct trace_event_amdgpu_vm_update_ptes *event) +{ + if (event->common.pid != getpid()) { + return; + } + + struct trace_event_address_array *array = (struct trace_event_address_array *)(event + 1); + + for (uint32_t i = 0; i < event->num_ptes; ++i) + emit_page_table_update_event(&device->vk.memory_trace_data, + !device->physical_device->rad_info.has_dedicated_vram, timestamp, + event, (uint64_t *)array->data, i); +} + +static void +append_trace_events(struct radv_device *device, int pipe_fd) +{ + /* Assuming 4KB if os_get_page_size fails. */ + uint64_t page_size = 4096; + os_get_page_size(&page_size); + + uint64_t timestamp; + + /* + * Parse the trace ring buffer page by page. 
+    */
+   char *page = (char *)malloc(page_size);
+   if (!page) {
+      return;
+   }
+   int64_t read_bytes;
+   do {
+      read_bytes = (int64_t)read(pipe_fd, page, page_size);
+      if (read_bytes < (int64_t)sizeof(struct trace_page_header))
+         break;
+
+      struct trace_page_header *page_header = (struct trace_page_header *)page;
+      timestamp = page_header->timestamp;
+
+      size_t data_size = MIN2((size_t)read_bytes, (size_t)page_header->commit);
+
+      char *read_ptr = page + sizeof(struct trace_page_header);
+      while (read_ptr - page < data_size) {
+         struct trace_event_header *event_header = (struct trace_event_header *)read_ptr;
+         read_ptr += sizeof(struct trace_event_header);
+
+         /* Handle special event type, see include/linux/ring_buffer.h in the
+          * kernel source */
+         switch (event_header->type_len) {
+         case TRACE_EVENT_TYPE_PADDING:
+            if (event_header->time_delta) {
+               /* Specified size, skip past padding */
+               read_ptr += event_header->excess_length;
+               timestamp += event_header->time_delta;
+               continue;
+            } else {
+               /* Padding is until end of page, skip until next page */
+               read_ptr = page + data_size;
+               continue;
+            }
+         case TRACE_EVENT_TYPE_EXTENDED_DELTA:
+            timestamp += event_header->time_delta;
+            timestamp += (uint64_t)event_header->excess_length << 28;
+            continue;
+         case TRACE_EVENT_TYPE_TIMESTAMP:
+            timestamp = event_header->time_delta;
+            timestamp |= (uint64_t)event_header->excess_length << 28;
+            continue;
+         default:
+            break;
+         }
+
+         timestamp += event_header->time_delta;
+
+         /* If type_len is not one of the special types and not zero, it is
+          * the data length / 4. */
+         size_t length;
+         struct trace_event_common *event;
+         if (event_header->type_len) {
+            length = event_header->type_len * 4 + 4;
+            /* The length variable already contains event data in this case. */
+            event = (struct trace_event_common *)&event_header->excess_length;
+         } else {
+            length = event_header->excess_length + 4;
+            event = (struct trace_event_common *)read_ptr;
+         }
+
+         if (event->type == device->memory_trace.ftrace_update_ptes_id)
+            evaluate_trace_event(device, timestamp, &device->vk.memory_trace_data.tokens,
+                                 (struct trace_event_amdgpu_vm_update_ptes *)event);
+
+         read_ptr += length - sizeof(struct trace_event_header);
+      }
+   } while (true);
+   free(page);
+}
+
+static void
+close_pipe_fds(struct radv_device *device)
+{
+   for (uint32_t i = 0; i < device->memory_trace.num_cpus; ++i) {
+      close(device->memory_trace.pipe_fds[i]);
+   }
+}
+
+void
+radv_memory_trace_init(struct radv_device *device)
+{
+#ifndef _WIN32
+   DIR *dir = opendir(RADV_FTRACE_INSTANCE_PATH);
+   if (!dir) {
+      fprintf(stderr,
+              "radv: Couldn't initialize memory tracing: "
+              "Can't access the tracing instance directory (%s)\n",
+              strerror(errno));
+      goto error;
+   }
+   closedir(dir);
+
+   device->memory_trace.num_cpus = 0;
+
+   char cpuinfo_line[1024];
+   FILE *cpuinfo_file = fopen("/proc/cpuinfo", "r");
+   uint32_t num_physical_cores = 0;
+   while (cpuinfo_file && fgets(cpuinfo_line, sizeof(cpuinfo_line), cpuinfo_file)) {
+      char *logical_core_string = strstr(cpuinfo_line, "siblings");
+      if (logical_core_string)
+         sscanf(logical_core_string, "siblings : %d", &device->memory_trace.num_cpus);
+      char *physical_core_string = strstr(cpuinfo_line, "cpu cores");
+      if (physical_core_string)
+         sscanf(physical_core_string, "cpu cores : %d", &num_physical_cores);
+   }
+   if (!device->memory_trace.num_cpus)
+      device->memory_trace.num_cpus = num_physical_cores;
+   if (cpuinfo_file)
+      fclose(cpuinfo_file);
+
+   FILE *clock_file = fopen(RADV_FTRACE_INSTANCE_PATH "/trace_clock", "w");
+   if (!clock_file) {
+      fprintf(stderr,
+              "radv: Couldn't initialize memory tracing: "
+ "Can't access the tracing control files (%s).\n", + strerror(errno)); + goto error; + } + + fprintf(clock_file, "mono"); + fclose(clock_file); + + device->memory_trace.pipe_fds = malloc(device->memory_trace.num_cpus * sizeof(int)); + + if (!device->memory_trace.pipe_fds) { + device->memory_trace.num_cpus = 0; + } + for (uint32_t i = 0; i < device->memory_trace.num_cpus; ++i) { + open_trace_pipe(i, device->memory_trace.pipe_fds + i); + + if (device->memory_trace.pipe_fds[i] == -1) { + fprintf(stderr, + "radv: Couldn't initialize memory tracing: " + "Can't access the trace buffer pipes (%s).\n", + strerror(errno)); + for (i -= 1; i < device->memory_trace.num_cpus; --i) { + close(device->memory_trace.pipe_fds[i]); + } + goto error; + } + } + + device->memory_trace.ftrace_update_ptes_id = trace_event_id("amdgpu_vm_update_ptes"); + if (device->memory_trace.ftrace_update_ptes_id == (uint16_t)~0U) { + fprintf(stderr, + "radv: Couldn't initialize memory tracing: " + "Can't access the trace event ID file (%s).\n", + strerror(errno)); + goto error_pipes; + } + + if (!set_event_tracing_enabled("amdgpu_vm_update_ptes", true)) { + fprintf(stderr, + "radv: Couldn't initialize memory tracing: " + "Can't enable trace events (%s).\n", + strerror(errno)); + goto error_pipes; + } + + fprintf(stderr, "radv: Enabled Memory Trace.\n"); + return; + +error_pipes: + close_pipe_fds(device); +error: + vk_memory_trace_finish(&device->vk); +#endif +} + +static void +fill_memory_info(struct radeon_info *info, struct vk_rmv_memory_info *out_info, int32_t index) +{ + switch (index) { + case VK_RMV_MEMORY_LOCATION_DEVICE: + out_info->physical_base_address = 0; + out_info->size = info->all_vram_visible ? (uint64_t)info->vram_size_kb * 1024ULL + : (uint64_t)info->vram_vis_size_kb * 1024ULL; + break; + case VK_RMV_MEMORY_LOCATION_DEVICE_INVISIBLE: + out_info->physical_base_address = (uint64_t)info->vram_vis_size_kb * 1024ULL; + out_info->size = info->all_vram_visible ? 
0 : (uint64_t)info->vram_size_kb * 1024ULL; + break; + case VK_RMV_MEMORY_LOCATION_HOST: { + uint64_t ram_size = -1U; + os_get_total_physical_memory(&ram_size); + out_info->physical_base_address = 0; + out_info->size = MIN2((uint64_t)info->gart_size_kb * 1024ULL, ram_size); + } break; + default: + unreachable("invalid memory index"); + } +} + +static enum vk_rmv_memory_type +memory_type_from_vram_type(uint32_t vram_type) +{ + switch (vram_type) { + case AMD_VRAM_TYPE_UNKNOWN: + return VK_RMV_MEMORY_TYPE_UNKNOWN; + case AMD_VRAM_TYPE_DDR2: + return VK_RMV_MEMORY_TYPE_DDR2; + case AMD_VRAM_TYPE_DDR3: + return VK_RMV_MEMORY_TYPE_DDR3; + case AMD_VRAM_TYPE_DDR4: + return VK_RMV_MEMORY_TYPE_DDR4; + case AMD_VRAM_TYPE_GDDR5: + return VK_RMV_MEMORY_TYPE_GDDR5; + case AMD_VRAM_TYPE_HBM: + return VK_RMV_MEMORY_TYPE_HBM; + case AMD_VRAM_TYPE_GDDR6: + return VK_RMV_MEMORY_TYPE_GDDR6; + case AMD_VRAM_TYPE_DDR5: + return VK_RMV_MEMORY_TYPE_LPDDR5; + default: + unreachable("Invalid vram type"); + } +} + +void +radv_rmv_fill_device_info(struct radv_physical_device *device, struct vk_rmv_device_info *info) +{ + struct radeon_info *rad_info = &device->rad_info; + + for (int32_t i = 0; i < VK_RMV_MEMORY_LOCATION_COUNT; ++i) { + fill_memory_info(rad_info, &info->memory_infos[i], i); + } + + if (rad_info->marketing_name) + strncpy(info->device_name, rad_info->marketing_name, sizeof(info->device_name) - 1); + info->pcie_family_id = rad_info->family_id; + info->pcie_revision_id = rad_info->pci_rev_id; + info->pcie_device_id = rad_info->pci_dev; + info->minimum_shader_clock = 0; + info->maximum_shader_clock = rad_info->max_gpu_freq_mhz; + info->vram_type = memory_type_from_vram_type(rad_info->vram_type); + info->vram_bus_width = rad_info->memory_bus_width; + info->vram_operations_per_clock = ac_memory_ops_per_clock(rad_info->vram_type); + info->minimum_memory_clock = 0; + info->maximum_memory_clock = rad_info->memory_freq_mhz; + info->vram_bandwidth = rad_info->memory_bandwidth_gbps; +} + +void +radv_rmv_collect_trace_events(struct radv_device *device) +{ + for (uint32_t i = 0; i < device->memory_trace.num_cpus; ++i) { + append_trace_events(device, device->memory_trace.pipe_fds[i]); + } +} + +void +radv_memory_trace_finish(struct radv_device *device) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + set_event_tracing_enabled("amdgpu_vm_update_ptes", false); + close_pipe_fds(device); +} + +/* The token lock must be held when entering _locked functions */ +static void +log_resource_bind_locked(struct radv_device *device, uint64_t resource, struct radeon_winsys_bo *bo, + uint64_t offset, uint64_t size) +{ + struct vk_rmv_resource_bind_token token = {0}; + token.address = bo->va + offset; + token.is_system_memory = bo->initial_domain & RADEON_DOMAIN_GTT; + token.size = size; + token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, resource); + + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_BIND, &token); +} + +void +radv_rmv_log_heap_create(struct radv_device *device, VkDeviceMemory heap, bool is_internal, + VkMemoryAllocateFlags alloc_flags) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + RADV_FROM_HANDLE(radv_device_memory, memory, heap); + + radv_rmv_log_bo_allocate(device, memory->bo, memory->alloc_size, false); + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + + struct vk_rmv_resource_create_token token = {0}; + token.is_driver_internal = is_internal; + token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, 
(uint64_t)heap); + token.type = VK_RMV_RESOURCE_TYPE_HEAP; + token.heap.alignment = device->physical_device->rad_info.max_alignment; + token.heap.size = memory->alloc_size; + token.heap.heap_index = memory->heap_index; + token.heap.alloc_flags = alloc_flags; + + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &token); + log_resource_bind_locked(device, (uint64_t)heap, memory->bo, 0, memory->alloc_size); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +} + +void +radv_rmv_log_bo_allocate(struct radv_device *device, struct radeon_winsys_bo *bo, uint32_t size, + bool is_internal) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + struct vk_rmv_virtual_allocate_token token = {0}; + token.address = bo->va; + /* If all VRAM is visible, no bo will be in invisible memory. */ + token.is_in_invisible_vram = + bo->vram_no_cpu_access && !device->physical_device->rad_info.all_vram_visible; + token.preferred_domains = (enum vk_rmv_kernel_memory_domain)bo->initial_domain; + token.is_driver_internal = is_internal; + token.page_count = DIV_ROUND_UP(size, 4096); + + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_VIRTUAL_ALLOCATE, &token); + radv_rmv_collect_trace_events(device); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +} + +void +radv_rmv_log_bo_destroy(struct radv_device *device, struct radeon_winsys_bo *bo) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + struct vk_rmv_virtual_free_token token = {0}; + token.address = bo->va; + + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_VIRTUAL_FREE, &token); + radv_rmv_collect_trace_events(device); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +} + +void +radv_rmv_log_buffer_bind(struct radv_device *device, VkBuffer _buffer) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + log_resource_bind_locked(device, (uint64_t)_buffer, buffer->bo, buffer->offset, buffer->vk.size); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +} + +void +radv_rmv_log_image_create(struct radv_device *device, const VkImageCreateInfo *create_info, + bool is_internal, VkImage _image) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + RADV_FROM_HANDLE(radv_image, image, _image); + + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + struct vk_rmv_resource_create_token token = {0}; + token.is_driver_internal = is_internal; + token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, (uint64_t)_image); + token.type = VK_RMV_RESOURCE_TYPE_IMAGE; + token.image.create_flags = create_info->flags; + token.image.usage_flags = create_info->usage; + token.image.type = create_info->imageType; + token.image.extent = create_info->extent; + token.image.format = create_info->format; + token.image.num_mips = create_info->mipLevels; + token.image.num_slices = create_info->arrayLayers; + token.image.tiling = create_info->tiling; + token.image.alignment_log2 = util_logbase2(image->alignment); + token.image.log2_samples = util_logbase2(image->info.samples); + token.image.log2_storage_samples = util_logbase2(image->info.storage_samples); + token.image.metadata_alignment_log2 = image->planes[0].surface.meta_alignment_log2; + token.image.image_alignment_log2 = 
image->planes[0].surface.alignment_log2; + token.image.size = image->size; + token.image.metadata_size = image->planes[0].surface.meta_size; + token.image.metadata_header_size = 0; + token.image.metadata_offset = image->planes[0].surface.meta_offset; + token.image.metadata_header_offset = image->planes[0].surface.meta_offset; + token.image.presentable = image->planes[0].surface.is_displayable; + + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &token); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +} + +void +radv_rmv_log_image_bind(struct radv_device *device, VkImage _image) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + RADV_FROM_HANDLE(radv_image, image, _image); + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + log_resource_bind_locked(device, (uint64_t)_image, image->bindings[0].bo, + image->bindings[0].offset, image->size); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +} + +void +radv_rmv_log_query_pool_create(struct radv_device *device, VkQueryPool _pool, bool is_internal) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + RADV_FROM_HANDLE(radv_query_pool, pool, _pool); + + if (pool->type != VK_QUERY_TYPE_OCCLUSION && pool->type != VK_QUERY_TYPE_PIPELINE_STATISTICS && + pool->type != VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT) + return; + + radv_rmv_log_bo_allocate(device, pool->bo, pool->size, is_internal); + + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + struct vk_rmv_resource_create_token create_token = {0}; + create_token.is_driver_internal = is_internal; + create_token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, (uint64_t)_pool); + create_token.type = VK_RMV_RESOURCE_TYPE_QUERY_HEAP; + create_token.query_pool.type = pool->type; + create_token.query_pool.has_cpu_access = true; + + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, + &create_token); + log_resource_bind_locked(device, (uint64_t)_pool, pool->bo, 0, pool->size); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +} + +void +radv_rmv_log_command_buffer_bo_create(struct radv_device *device, struct radeon_winsys_bo *bo, + uint32_t executable_size, uint32_t data_size, + uint32_t scratch_size) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + /* Only one of executable_size, data_size and scratch_size should be > 0 */ + /* TODO: Trace CS BOs for executable data */ + uint32_t size = MAX3(executable_size, data_size, scratch_size); + + radv_rmv_log_bo_allocate(device, bo, size, true); + + uint64_t upload_resource_identifier = (uint64_t)(uintptr_t)bo; + + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + struct vk_rmv_resource_create_token create_token = {0}; + create_token.is_driver_internal = true; + create_token.resource_id = + vk_rmv_get_resource_id_locked(&device->vk, upload_resource_identifier); + create_token.type = VK_RMV_RESOURCE_TYPE_COMMAND_ALLOCATOR; + create_token.command_buffer.preferred_domain = + (enum vk_rmv_kernel_memory_domain)device->ws->cs_domain(device->ws); + create_token.command_buffer.executable_size = executable_size; + create_token.command_buffer.app_available_executable_size = executable_size; + create_token.command_buffer.embedded_data_size = data_size; + create_token.command_buffer.app_available_embedded_data_size = data_size; + create_token.command_buffer.scratch_size = scratch_size; + create_token.command_buffer.app_available_scratch_size = scratch_size; + + 
vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, + &create_token); + log_resource_bind_locked(device, upload_resource_identifier, bo, 0, size); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); + vk_rmv_log_cpu_map(&device->vk, bo->va, false); +} + +void +radv_rmv_log_command_buffer_bo_destroy(struct radv_device *device, struct radeon_winsys_bo *bo) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + struct vk_rmv_resource_destroy_token destroy_token = {0}; + destroy_token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, (uint64_t)(uintptr_t)bo); + + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_DESTROY, + &destroy_token); + vk_rmv_destroy_resource_id_locked(&device->vk, (uint64_t)(uintptr_t)bo); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); + radv_rmv_log_bo_destroy(device, bo); + vk_rmv_log_cpu_map(&device->vk, bo->va, true); +} + +void +radv_rmv_log_border_color_palette_create(struct radv_device *device, struct radeon_winsys_bo *bo) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + radv_rmv_log_bo_allocate(device, bo, RADV_BORDER_COLOR_BUFFER_SIZE, true); + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + uint32_t resource_id = vk_rmv_get_resource_id_locked(&device->vk, (uint64_t)(uintptr_t)bo); + + struct vk_rmv_resource_create_token create_token = {0}; + create_token.is_driver_internal = true; + create_token.resource_id = resource_id; + create_token.type = VK_RMV_RESOURCE_TYPE_BORDER_COLOR_PALETTE; + /* + * We have 4096 entries, but the corresponding RMV token only has 8 bits. + */ + create_token.border_color_palette.num_entries = 255; /* = RADV_BORDER_COLOR_COUNT; */ + + struct vk_rmv_resource_bind_token bind_token; + bind_token.address = bo->va; + bind_token.is_system_memory = false; + bind_token.resource_id = resource_id; + bind_token.size = RADV_BORDER_COLOR_BUFFER_SIZE; + + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, + &create_token); + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_BIND, &bind_token); + vk_rmv_log_cpu_map(&device->vk, bo->va, false); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +} + +void +radv_rmv_log_border_color_palette_destroy(struct radv_device *device, struct radeon_winsys_bo *bo) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + struct vk_rmv_resource_destroy_token token = {0}; + /* same resource id as the create token */ + token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, (uint64_t)(uintptr_t)bo); + + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_DESTROY, &token); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); + vk_rmv_log_cpu_map(&device->vk, bo->va, true); +} + +void +radv_rmv_log_sparse_add_residency(struct radv_device *device, struct radeon_winsys_bo *src_bo, + uint64_t offset) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + struct vk_rmv_resource_reference_token token = {0}; + token.virtual_address = src_bo->va + offset; + token.residency_removed = false; + + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_REFERENCE, &token); + radv_rmv_collect_trace_events(device); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +} + 
+void
+radv_rmv_log_sparse_remove_residency(struct radv_device *device, struct radeon_winsys_bo *src_bo,
+                                     uint64_t offset)
+{
+   if (!device->vk.memory_trace_data.is_enabled)
+      return;
+
+   struct vk_rmv_resource_reference_token token = {0};
+   token.virtual_address = src_bo->va + offset;
+   token.residency_removed = true;
+
+   simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+   vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_REFERENCE, &token);
+   radv_rmv_collect_trace_events(device);
+   simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+radv_rmv_log_descriptor_pool_create(struct radv_device *device,
+                                    const VkDescriptorPoolCreateInfo *create_info,
+                                    VkDescriptorPool _pool, bool is_internal)
+{
+   if (!device->vk.memory_trace_data.is_enabled)
+      return;
+
+   RADV_FROM_HANDLE(radv_descriptor_pool, pool, _pool);
+
+   if (pool->bo) {
+      radv_rmv_log_bo_allocate(device, pool->bo, pool->size, is_internal);
+      vk_rmv_log_cpu_map(&device->vk, pool->bo->va, false);
+   }
+
+   simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+   struct vk_rmv_resource_create_token create_token = {0};
+   create_token.is_driver_internal = is_internal;
+   create_token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, (uint64_t)_pool);
+   create_token.type = VK_RMV_RESOURCE_TYPE_DESCRIPTOR_POOL;
+   create_token.descriptor_pool.max_sets = create_info->maxSets;
+   create_token.descriptor_pool.pool_size_count = create_info->poolSizeCount;
+   /* The common RMV code frees this allocation automatically when the trace is done. */
+   create_token.descriptor_pool.pool_sizes =
+      malloc(create_info->poolSizeCount * sizeof(VkDescriptorPoolSize));
+   if (!create_token.descriptor_pool.pool_sizes) {
+      simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+      return;
+   }
+
+   memcpy(create_token.descriptor_pool.pool_sizes, create_info->pPoolSizes,
+          create_info->poolSizeCount * sizeof(VkDescriptorPoolSize));
+
+   vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE,
+                     &create_token);
+   simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+
+   if (pool->bo) {
+      simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+      struct vk_rmv_resource_bind_token bind_token;
+      bind_token.address = pool->bo->va;
+      bind_token.is_system_memory = false;
+      bind_token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, (uint64_t)_pool);
+      bind_token.size = pool->size;
+
+      vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_BIND,
+                        &bind_token);
+      simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+   }
+}
+
+void
+radv_rmv_log_graphics_pipeline_create(struct radv_device *device, VkPipelineCreateFlags flags,
+                                      struct radv_pipeline *pipeline, bool is_internal)
+{
+   if (!device->vk.memory_trace_data.is_enabled)
+      return;
+
+   VkPipeline _pipeline = radv_pipeline_to_handle(pipeline);
+   struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
+
+   simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+   struct vk_rmv_resource_create_token create_token = {0};
+   create_token.is_driver_internal = is_internal;
+   create_token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, (uint64_t)_pipeline);
+   create_token.type = VK_RMV_RESOURCE_TYPE_PIPELINE;
+   create_token.pipeline.is_internal = is_internal;
+   create_token.pipeline.hash_lo = pipeline->pipeline_hash;
+   create_token.pipeline.is_ngg = graphics_pipeline->is_ngg;
+   create_token.pipeline.shader_stages = graphics_pipeline->active_stages;
+
+   vk_rmv_emit_token(&device->vk.memory_trace_data,
VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, + &create_token); + log_resource_bind_locked(device, (uint64_t)_pipeline, pipeline->slab_bo, + pipeline->slab->alloc->offset, pipeline->slab->alloc->size); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +} + +void +radv_rmv_log_compute_pipeline_create(struct radv_device *device, VkPipelineCreateFlags flags, + struct radv_pipeline *pipeline, bool is_internal) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + VkPipeline _pipeline = radv_pipeline_to_handle(pipeline); + + VkShaderStageFlagBits active_stages = VK_SHADER_STAGE_COMPUTE_BIT; + + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + struct vk_rmv_resource_create_token create_token = {0}; + create_token.is_driver_internal = is_internal; + create_token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, (uint64_t)_pipeline); + create_token.type = VK_RMV_RESOURCE_TYPE_PIPELINE; + create_token.pipeline.is_internal = is_internal; + create_token.pipeline.hash_lo = pipeline->pipeline_hash; + create_token.pipeline.is_ngg = false; + create_token.pipeline.shader_stages = active_stages; + + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, + &create_token); + log_resource_bind_locked(device, (uint64_t)_pipeline, pipeline->slab_bo, + pipeline->slab->alloc->offset, pipeline->slab->alloc->size); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +} + +void +radv_rmv_log_event_create(struct radv_device *device, VkEvent _event, VkEventCreateFlags flags, + bool is_internal) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + RADV_FROM_HANDLE(radv_event, event, _event); + + radv_rmv_log_bo_allocate(device, event->bo, 8, is_internal); + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + struct vk_rmv_resource_create_token create_token = {0}; + create_token.is_driver_internal = is_internal; + create_token.type = VK_RMV_RESOURCE_TYPE_GPU_EVENT; + create_token.event.flags = flags; + create_token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, (uint64_t)_event); + + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, + &create_token); + if (event->map) + vk_rmv_log_cpu_map(&device->vk, event->bo->va, false); + log_resource_bind_locked(device, (uint64_t)_event, event->bo, 0, 8); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +} + +void +radv_rmv_log_submit(struct radv_device *device, enum amd_ip_type type) +{ + if (!device->vk.memory_trace_data.is_enabled) + return; + + switch (type) { + case AMD_IP_GFX: + vk_rmv_log_misc_token(&device->vk, VK_RMV_MISC_EVENT_TYPE_SUBMIT_GRAPHICS); + break; + case AMD_IP_COMPUTE: + vk_rmv_log_misc_token(&device->vk, VK_RMV_MISC_EVENT_TYPE_SUBMIT_COMPUTE); + break; + case AMD_IP_SDMA: + vk_rmv_log_misc_token(&device->vk, VK_RMV_MISC_EVENT_TYPE_SUBMIT_COPY); + break; + default: + unreachable("invalid ip type"); + } +} + +void +radv_rmv_log_resource_destroy(struct radv_device *device, uint64_t handle) +{ + if (!device->vk.memory_trace_data.is_enabled || handle == 0) + return; + + simple_mtx_lock(&device->vk.memory_trace_data.token_mtx); + struct vk_rmv_resource_destroy_token token = {0}; + token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, handle); + + vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_DESTROY, &token); + vk_rmv_destroy_resource_id_locked(&device->vk, handle); + simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx); +}
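
For reference, the raw ftrace ring-buffer format that append_trace_events() parses can be illustrated with a small self-contained program. The struct layouts mirror the ones in the patch; the hand-built sample page, the event type ID (42), and the little-endian, GCC-style bitfield layout are illustrative assumptions, not part of the patch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same layouts as trace_page_header/trace_event_header in the patch. */
struct page_header {
   uint64_t timestamp;
   int32_t commit;
};

struct event_header {
   uint32_t type_len : 5;
   uint32_t time_delta : 27;
   uint32_t excess_length; /* overlaid by event payload when type_len != 0 */
};

int main(void)
{
   _Alignas(uint64_t) unsigned char page[4096] = {0};

   /* Hand-built sample page: base timestamp 1000, 12 committed bytes. */
   struct page_header ph = {.timestamp = 1000, .commit = 12};
   memcpy(page, &ph, sizeof(ph));

   /* One data event: type_len = 2 encodes 2 * 4 + 4 = 12 bytes of payload
    * starting at the excess_length word; the first payload field is
    * trace_event_common::type (42 is an arbitrary sample ID). */
   struct event_header eh = {.type_len = 2, .time_delta = 50};
   uint16_t sample_type = 42;
   memcpy(&eh.excess_length, &sample_type, sizeof(sample_type));
   memcpy(page + sizeof(ph), &eh, sizeof(eh));

   /* Decode using the same rules as append_trace_events(). */
   struct page_header *hdr = (struct page_header *)page;
   uint64_t timestamp = hdr->timestamp;
   unsigned char *read_ptr = page + sizeof(*hdr);
   unsigned char *end = read_ptr + hdr->commit;

   while (read_ptr < end) {
      struct event_header *h = (struct event_header *)read_ptr;
      read_ptr += sizeof(*h);

      timestamp += h->time_delta;
      /* Non-special, non-zero type_len: payload length is type_len * 4,
       * plus the 4 bytes already occupied by excess_length. */
      size_t length = h->type_len ? h->type_len * 4 + 4 : h->excess_length + 4;
      uint16_t type;
      memcpy(&type, &h->excess_length, sizeof(type));

      printf("event type %u at timestamp %llu, %zu-byte payload\n", type,
             (unsigned long long)timestamp, length);

      read_ptr += length - sizeof(*h);
   }
   return 0;
}

This prints "event type 42 at timestamp 1050, 12-byte payload": the 4-byte bitfield word plus the 8-byte payload account for exactly the 12 committed bytes, matching the read_ptr arithmetic in the patch.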
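The address math in emit_page_table_update_event() is likewise easiest to follow with concrete numbers. This standalone sketch applies the same formulas; the event values (start page 0x100, 64 KiB incr, 4 PTEs) are made up for illustration:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MIN_GPU_PAGE_SIZE 4096

int main(void)
{
   /* Sample amdgpu_vm_update_ptes event: start/end are in units of
    * minimum-size GPU pages, incr is the byte size covered per PTE. */
   uint64_t start = 0x100; /* first mapped GPU page */
   uint64_t incr = 65536;  /* 64 KiB per PTE */
   unsigned num_ptes = 4;
   uint64_t end = start + num_ptes * (incr / MIN_GPU_PAGE_SIZE);

   for (unsigned i = 0; i < num_ptes; i++) {
      /* Same formulas as emit_page_table_update_event(). */
      uint64_t end_addr =
         (i == num_ptes - 1) ? end : start + (i + 1) * (incr / MIN_GPU_PAGE_SIZE);
      uint64_t start_addr = start + i * (incr / MIN_GPU_PAGE_SIZE);
      uint64_t page_count = (end_addr - start_addr) * MIN_GPU_PAGE_SIZE / incr;
      uint64_t virtual_address = start * MIN_GPU_PAGE_SIZE + i * incr;

      printf("PTE %u: VA 0x%" PRIx64 ", %" PRIu64 " page(s) of %" PRIu64 " bytes\n",
             i, virtual_address, page_count, incr);
   }
   return 0;
}

Each iteration reports one page of incr bytes starting at 0x100000 + i * 0x10000, i.e. RMV receives one incr-sized page per PTE rather than incr / 4096 minimum-sized pages.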
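Finally, the radv_rmv_log_* hooks above all share one structure: log the backing BO allocation first (that helper takes the token lock itself), then hold token_mtx while translating the Vulkan handle to a stable resource ID and emitting a create token followed by a bind token. A condensed sketch of that pattern, using the tree-internal names from this patch; it only compiles inside the Mesa tree, and the handle, size, and resource type are placeholders:

static void
log_example_resource(struct radv_device *device, uint64_t handle,
                     struct radeon_winsys_bo *bo, uint32_t size)
{
   if (!device->vk.memory_trace_data.is_enabled)
      return;

   /* BO allocations are logged outside the token lock; the helper locks
    * token_mtx internally. */
   radv_rmv_log_bo_allocate(device, bo, size, true);

   simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);

   struct vk_rmv_resource_create_token create_token = {0};
   create_token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, handle);
   create_token.type = VK_RMV_RESOURCE_TYPE_HEAP; /* placeholder; also fill the
                                                   * matching union member */
   vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE,
                     &create_token);

   /* Tie the resource to its GPU VA range with a bind token. */
   log_resource_bind_locked(device, handle, bo, 0, size);

   simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
}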