freedreno/drm: Add sub-allocator
Add a heap that we can use for allocations of small mappable buffers. This avoids the churn of mmap/unmap, which is especially expensive in a VM, and allows packing multiple small allocations together in a page, which is useful for PIPE_BUFFERs (which are also mappable). Avoiding the overhead of setting up and tearing down guest mappings when running in a VM removes a source of jank, and suballocation also significantly reduces the number of BOs referenced on a submit.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20263>
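For orientation, here is a minimal usage sketch of the new heap API (illustrative only, not part of the patch; real callers reach the heap indirectly through fd_bo_new(), and error handling is omitted):

#include <string.h>

#include "freedreno_drmif.h"
#include "freedreno_priv.h"

/* Sketch only: assumes `dev` is a device where userspace-allocated iova
 * and userspace fencing are available (a6xx+ or virtio_gpu).
 */
static void
heap_usage_sketch(struct fd_device *dev)
{
   /* A heap for plain mappable buffers (flags == 0): */
   struct fd_bo_heap *heap = fd_bo_heap_new(dev, 0);

   /* Small allocations are carved out of shared 4MB backing blocks, so
    * this creates no new GEM handle and performs no mmap():
    */
   struct fd_bo *bo = fd_bo_heap_alloc(heap, 0x1000);

   /* bo->map is pre-populated as an offset into the block's mapping, so
    * fd_bo_map() should simply return it:
    */
   memcpy(fd_bo_map(bo), "hello", 6);

   fd_bo_del(bo);            /* parked on the heap freelist until idle */
   fd_bo_heap_destroy(heap);
}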
@@ -114,6 +114,13 @@ bo_new(struct fd_device *dev, uint32_t size, uint32_t flags,
 {
    struct fd_bo *bo = NULL;
 
+   if (size < FD_BO_HEAP_BLOCK_SIZE) {
+      if ((flags == 0) && dev->default_heap)
+         return fd_bo_heap_alloc(dev->default_heap, size);
+      if ((flags == RING_FLAGS) && dev->ring_heap)
+         return fd_bo_heap_alloc(dev->ring_heap, size);
+   }
+
    /* demote cached-coherent to WC if not supported: */
    if ((flags & FD_BO_CACHED_COHERENT) && !dev->has_cached_coherent)
       flags &= ~FD_BO_CACHED_COHERENT;

@@ -278,13 +285,16 @@ bo_del_or_recycle(struct fd_bo *bo)
 {
    struct fd_device *dev = bo->dev;
 
-   if ((bo->bo_reuse == BO_CACHE) &&
-       (fd_bo_cache_free(&dev->bo_cache, bo) == 0))
-      return 0;
+   /* No point in BO cache for suballocated buffers: */
+   if (!suballoc_bo(bo)) {
+      if ((bo->bo_reuse == BO_CACHE) &&
+          (fd_bo_cache_free(&dev->bo_cache, bo) == 0))
+         return 0;
 
-   if ((bo->bo_reuse == RING_CACHE) &&
-       (fd_bo_cache_free(&dev->ring_cache, bo) == 0))
-      return 0;
+      if ((bo->bo_reuse == RING_CACHE) &&
+          (fd_bo_cache_free(&dev->ring_cache, bo) == 0))
+         return 0;
+   }
 
    return bo_del(bo);
 }

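The cache is skipped here because a suballocated BO has no GEM handle of its own to recycle: its destroy hook (sa_destroy() in the new file below) just parks it on the heap's freelist, and the range is returned to the heap once the BO is idle. A hypothetical sketch of that lifetime, using only names introduced by this patch and assuming a device with dev->default_heap set up (the submit itself is elided):

static void
delete_while_busy_sketch(struct fd_device *dev)
{
   struct fd_bo *bo = fd_bo_heap_alloc(dev->default_heap, 0x1000);

   /* ... suppose a submit referencing bo has been flushed but not yet
    * retired, so fd_bo_state(bo) is not FD_BO_STATE_IDLE ...
    */

   /* No cache recycle, no munmap, no GEM_CLOSE: sa_destroy() only adds
    * the BO to heap->freelist.
    */
   fd_bo_del(bo);

   /* A later fd_bo_heap_alloc() (or fd_bo_heap_destroy()) calls
    * heap_clean(), which walks the freelist and returns the ranges of
    * now-idle BOs to the util_vma_heap.
    */
}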
@@ -355,6 +365,16 @@ fd_bo_del_list_nocache(struct list_head *list)
    close_handles(dev, handles, cnt);
 }
 
+void
+fd_bo_fini_fences(struct fd_bo *bo)
+{
+   for (int i = 0; i < bo->nr_fences; i++)
+      fd_fence_del(bo->fences[i]);
+
+   if (bo->fences != &bo->_inline_fence)
+      free(bo->fences);
+}
+
 /**
  * Helper called by backends bo->funcs->destroy()
  *

@@ -371,11 +391,7 @@ fd_bo_fini_common(struct fd_bo *bo)
 
    VG_BO_FREE(bo);
 
-   for (int i = 0; i < bo->nr_fences; i++)
-      fd_fence_del(bo->fences[i]);
-
-   if (bo->fences != &bo->_inline_fence)
-      free(bo->fences);
+   fd_bo_fini_fences(bo);
 
    if (bo->map)
       os_munmap(bo->map, bo->size);

src/freedreno/drm/freedreno_bo_heap.c (new file, 284 lines)
@@ -0,0 +1,284 @@
/*
 * Copyright © 2022 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "freedreno_drmif.h"
#include "freedreno_priv.h"

struct sa_bo {
   struct fd_bo base;
   struct fd_bo_heap *heap;
   unsigned offset;
};
FD_DEFINE_CAST(fd_bo, sa_bo);

#define HEAP_DEBUG 0

static void heap_clean(struct fd_bo_heap *heap, bool idle);
static void heap_dump(struct fd_bo_heap *heap);

struct fd_bo_heap *
fd_bo_heap_new(struct fd_device *dev, uint32_t flags)
{
   struct fd_bo_heap *heap;

   /* We cannot suballocate shared buffers!  Implicit sync is not supported! */
   assert(!(flags & FD_BO_SHARED));

   /* No internal buffers either, we need userspace fencing: */
   assert(!(flags & _FD_BO_NOSYNC));

   heap = calloc(1, sizeof(*heap));

   heap->dev = dev;
   heap->flags = flags;
   simple_mtx_init(&heap->lock, mtx_plain);
   list_inithead(&heap->freelist);

   /* Note that util_vma_heap_init doesn't like offset==0, so we shift the
    * entire range by one block size (see block_idx()):
    */
   util_vma_heap_init(&heap->heap, FD_BO_HEAP_BLOCK_SIZE,
                      FD_BO_HEAP_BLOCK_SIZE * ARRAY_SIZE(heap->blocks));
   heap->heap.alloc_high = false;
   heap->heap.nospan_shift = ffs(FD_BO_HEAP_BLOCK_SIZE) - 1;

   heap_dump(heap);

   return heap;
}

void fd_bo_heap_destroy(struct fd_bo_heap *heap)
{
   /* drain the freelist: */
   heap_clean(heap, false);

   util_vma_heap_finish(&heap->heap);
   for (unsigned i = 0; i < ARRAY_SIZE(heap->blocks); i++)
      if (heap->blocks[i])
         fd_bo_del(heap->blocks[i]);
   free(heap);
}

static bool
sa_idle(struct fd_bo *bo)
{
   enum fd_bo_state state = fd_bo_state(bo);
   assert(state != FD_BO_STATE_UNKNOWN);
   return state == FD_BO_STATE_IDLE;
}

/**
 * The backing block is determined by the offset within the heap, since all
 * the blocks are equal size
 */
static unsigned
block_idx(struct sa_bo *s)
{
   /* The vma allocator doesn't like offset=0 so the range is shifted up
    * by one block size:
    */
   return (s->offset / FD_BO_HEAP_BLOCK_SIZE) - 1;
}

static unsigned
block_offset(struct sa_bo *s)
{
   return s->offset % FD_BO_HEAP_BLOCK_SIZE;
}

static void
heap_dump(struct fd_bo_heap *heap)
{
   if (!HEAP_DEBUG)
      return;
   fprintf(stderr, "HEAP[%x]: freelist: %u\n", heap->flags, list_length(&heap->freelist));
   util_vma_heap_print(&heap->heap, stderr, "",
                       FD_BO_HEAP_BLOCK_SIZE * ARRAY_SIZE(heap->blocks));
}

static void
sa_release(struct fd_bo *bo)
{
   struct sa_bo *s = to_sa_bo(bo);

   simple_mtx_assert_locked(&s->heap->lock);

   VG_BO_FREE(bo);

   fd_bo_fini_fences(bo);

   if (HEAP_DEBUG)
      mesa_logi("release: %08x-%x idx=%d", s->offset, bo->size, block_idx(s));

   util_vma_heap_free(&s->heap->heap, s->offset, bo->size);

   /* Drop our reference to the backing block object: */
   fd_bo_del(s->heap->blocks[block_idx(s)]);

   list_del(&bo->node);

   if ((++s->heap->cnt % 256) == 0)
      heap_dump(s->heap);

   free(bo);
}

static int
sa_cpu_prep(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op)
{
   simple_mtx_lock(&fence_lock);
   unsigned nr = bo->nr_fences;
   struct fd_fence *fences[nr];
   for (unsigned i = 0; i < nr; i++)
      fences[i] = fd_fence_ref_locked(bo->fences[i]);
   simple_mtx_unlock(&fence_lock);

   for (unsigned i = 0; i < nr; i++) {
      fd_fence_wait(fences[i]);
      fd_fence_del(fences[i]);
   }

   /* expire completed fences */
   fd_bo_state(bo);

   assert(fd_bo_state(bo) == FD_BO_STATE_IDLE);

   return 0;
}

static int
sa_madvise(struct fd_bo *bo, int willneed)
{
   return willneed;
}

static uint64_t
sa_iova(struct fd_bo *bo)
{
   struct sa_bo *s = to_sa_bo(bo);

   return s->heap->blocks[block_idx(s)]->iova + block_offset(s);
}

static void
sa_set_name(struct fd_bo *bo, const char *fmt, va_list ap)
{
   /* No-op, kernel has a single name for the entire buffer we suballoc from */
}

static void
sa_destroy(struct fd_bo *bo)
{
   struct fd_bo_heap *heap = to_sa_bo(bo)->heap;

   simple_mtx_lock(&heap->lock);
   list_addtail(&bo->node, &heap->freelist);
   simple_mtx_unlock(&heap->lock);
}

static struct fd_bo_funcs heap_bo_funcs = {
   .cpu_prep = sa_cpu_prep,
   .madvise = sa_madvise,
   .iova = sa_iova,
   .set_name = sa_set_name,
   .destroy = sa_destroy,
};

/**
 * Get the backing heap block of a suballocated bo
 */
struct fd_bo *
fd_bo_heap_block(struct fd_bo *bo)
{
   assert(suballoc_bo(bo));

   struct sa_bo *s = to_sa_bo(bo);
   return s->heap->blocks[block_idx(s)];
}

static void
heap_clean(struct fd_bo_heap *heap, bool idle)
{
   simple_mtx_lock(&heap->lock);
   foreach_bo_safe (bo, &heap->freelist) {
      /* It might be nice if we could keep freelist sorted by fence # */
      if (idle && !sa_idle(bo))
         continue;
      sa_release(bo);
   }
   simple_mtx_unlock(&heap->lock);
}

struct fd_bo *
fd_bo_heap_alloc(struct fd_bo_heap *heap, uint32_t size)
{
   heap_clean(heap, true);

   struct sa_bo *s = calloc(1, sizeof(*s));

   s->heap = heap;

   /* util_vma does not like zero byte allocations, which we get, for
    * ex, with the initial query buffer allocation on pre-a5xx:
    */
   size = MAX2(size, SUBALLOC_ALIGNMENT);

   size = ALIGN(size, SUBALLOC_ALIGNMENT);

   simple_mtx_lock(&heap->lock);
   /* Allocate larger buffers from the bottom, and smaller buffers from top
    * to help limit fragmentation:
    *
    * (The 8k threshold is just a random guess, but seems to work ok)
    */
   heap->heap.alloc_high = (size <= 8 * 1024);
   s->offset = util_vma_heap_alloc(&heap->heap, size, SUBALLOC_ALIGNMENT);
   assert((s->offset / FD_BO_HEAP_BLOCK_SIZE) == (s->offset + size - 1) / FD_BO_HEAP_BLOCK_SIZE);
   unsigned idx = block_idx(s);
   if (HEAP_DEBUG)
      mesa_logi("alloc: %08x-%x idx=%d", s->offset, size, idx);
   if (!heap->blocks[idx]) {
      heap->blocks[idx] = fd_bo_new(
         heap->dev, FD_BO_HEAP_BLOCK_SIZE, heap->flags,
         "heap-%x-block-%u", heap->flags, idx);
   }
   /* Take a reference to the backing obj: */
   fd_bo_ref(heap->blocks[idx]);
   simple_mtx_unlock(&heap->lock);

   struct fd_bo *bo = &s->base;

   bo->size = size;
   bo->funcs = &heap_bo_funcs;
   bo->handle = 1; /* dummy handle to make fd_bo_init_common() happy */
   bo->alloc_flags = heap->flags;

   fd_bo_init_common(bo, heap->dev);

   bo->handle = FD_BO_SUBALLOC_HANDLE;

   /* Pre-initialize mmap ptr, to avoid trying to os_mmap() */
   bo->map = ((uint8_t *)fd_bo_map(heap->blocks[idx])) + block_offset(s);

   return bo;
}

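To make the block_idx()/block_offset() arithmetic above concrete, here is a small standalone example with a hypothetical heap offset (not part of the patch):

#include <assert.h>
#include <stdint.h>

#define FD_BO_HEAP_BLOCK_SIZE (4 * 1024 * 1024)

int
main(void)
{
   /* Suppose util_vma_heap_alloc() returned offset 0x601000.  The heap
    * range starts at one block size rather than zero, so block 0 covers
    * offsets [0x400000, 0x800000):
    */
   uint32_t offset = 0x601000;

   unsigned idx = (offset / FD_BO_HEAP_BLOCK_SIZE) - 1;   /* block_idx()    -> 0 */
   unsigned off = offset % FD_BO_HEAP_BLOCK_SIZE;         /* block_offset() -> 0x201000 */

   assert(idx == 0);
   assert(off == 0x201000);

   /* The GPU address is blocks[idx]->iova + off, and the CPU pointer is
    * the block's mmap plus the same off.
    */
   return 0;
}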
@@ -43,6 +43,7 @@ fd_device_new(int fd)
 {
    struct fd_device *dev = NULL;
    drmVersionPtr version;
+   bool use_heap = false;
 
    /* figure out if we are kgsl or msm drm driver: */
    version = drmGetVersion(fd);

@@ -64,6 +65,10 @@ fd_device_new(int fd)
    } else if (!strcmp(version->name, "virtio_gpu")) {
       DEBUG_MSG("virtio_gpu DRM device");
       dev = virtio_device_new(fd, version);
+      /* Only devices that support a hypervisor are a6xx+, so avoid the
+       * extra guest<->host round trips associated with pipe creation:
+       */
+      use_heap = true;
 #endif
 #if HAVE_FREEDRENO_KGSL
    } else if (!strcmp(version->name, "kgsl")) {

@@ -96,6 +101,23 @@ out:
    simple_mtx_init(&dev->submit_lock, mtx_plain);
    simple_mtx_init(&dev->suballoc_lock, mtx_plain);
 
+   if (!use_heap) {
+      struct fd_pipe *pipe = fd_pipe_new(dev, FD_PIPE_3D);
+
+      /* Userspace fences don't appear to be reliable enough (missing some
+       * cache flushes?) on older gens, so limit sub-alloc heaps to a6xx+
+       * for now:
+       */
+      use_heap = fd_dev_gen(&pipe->dev_id) >= 6;
+
+      fd_pipe_del(pipe);
+   }
+
+   if (use_heap) {
+      dev->ring_heap = fd_bo_heap_new(dev, RING_FLAGS);
+      dev->default_heap = fd_bo_heap_new(dev, 0);
+   }
+
    return dev;
 }

@@ -158,6 +180,12 @@ fd_device_del(struct fd_device *dev)
    if (dev->suballoc_bo)
       fd_bo_del(dev->suballoc_bo);
 
+   if (dev->ring_heap)
+      fd_bo_heap_destroy(dev->ring_heap);
+
+   if (dev->default_heap)
+      fd_bo_heap_destroy(dev->default_heap);
+
    fd_bo_cache_cleanup(&dev->bo_cache, 0);
    fd_bo_cache_cleanup(&dev->ring_cache, 0);

@@ -130,6 +130,7 @@ struct fd_fence *fd_fence_ref_locked(struct fd_fence *f);
 void fd_fence_del(struct fd_fence *f);
 void fd_fence_del_locked(struct fd_fence *f);
 void fd_fence_flush(struct fd_fence *f);
+int fd_fence_wait(struct fd_fence *f);
 
 /*
  * bo flags:

@@ -286,3 +286,9 @@ fd_fence_flush(struct fd_fence *f)
    fd_pipe_flush(f->pipe, f->ufence);
    util_queue_fence_wait(&f->ready);
 }
+
+int
+fd_fence_wait(struct fd_fence *f)
+{
+   return fd_pipe_wait(f->pipe, f);
+}

@@ -46,6 +46,7 @@
 #include "util/u_atomic.h"
 #include "util/u_debug.h"
 #include "util/u_math.h"
+#include "util/vma.h"
 
 #include "freedreno_dev_info.h"
 #include "freedreno_drmif.h"

@@ -126,6 +127,77 @@ struct fd_bo_cache {
    time_t time;
 };
 
+/* Probably good for the block size to be a multiple of an available
+ * large-page size.  For overlap of what both the MMU (with 4kb granule)
+ * and SMMU support, 2MB is that overlap.  (Well, 4kb is as well, but
+ * too small to be practical ;-))
+ */
+#define FD_BO_HEAP_BLOCK_SIZE (4 * 1024 * 1024)
+
+/* Zero is an invalid handle, use it to indicate buffers that have been sub-
+ * allocated from a larger backing heap block buffer.
+ */
+#define FD_BO_SUBALLOC_HANDLE 0
+
+static inline bool
+suballoc_bo(struct fd_bo *bo)
+{
+   return bo->handle == FD_BO_SUBALLOC_HANDLE;
+}
+
+/**
+ * A heap is a virtual range of memory that is backed by N physical buffers,
+ * from which buffers can be suballocated.  This requires kernel support for
+ * userspace allocated iova.
+ */
+struct fd_bo_heap {
+   struct fd_device *dev;
+
+   int cnt;
+
+   /**
+    * Buffer allocation flags for buffers allocated from this heap.
+    */
+   uint32_t flags;
+
+   simple_mtx_t lock;
+
+   /**
+    * Ranges of the backing buffer are allocated at a granularity of
+    * SUBALLOC_ALIGNMENT
+    */
+   struct util_vma_heap heap;
+
+   /**
+    * List of recently freed suballocated BOs from this allocator until they
+    * become idle.  Backend should periodically call fd_bo_suballoc_clean()
+    * to check for newly idle entries on the freelist, so that the memory can
+    * be returned to the free heap.
+    */
+   struct list_head freelist;
+
+   /**
+    * The backing buffers.  Maximum total heap size is:
+    * FD_BO_HEAP_BLOCK_SIZE * ARRAY_SIZE(heap->blocks)
+    */
+   struct fd_bo *blocks[256];
+};
+
+struct fd_bo_heap *fd_bo_heap_new(struct fd_device *dev, uint32_t flags);
+void fd_bo_heap_destroy(struct fd_bo_heap *heap);
+
+struct fd_bo *fd_bo_heap_block(struct fd_bo *bo);
+struct fd_bo *fd_bo_heap_alloc(struct fd_bo_heap *heap, uint32_t size);
+
+static inline uint32_t
+submit_offset(struct fd_bo *bo, uint32_t offset)
+{
+   if (suballoc_bo(bo)) {
+      offset += bo->iova - fd_bo_heap_block(bo)->iova;
+   }
+   return offset;
+}
+
 struct fd_device {
    int fd;
    enum fd_version version;

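A quick standalone check (hypothetical values, not part of the patch) of the heap size limit and of the submit_offset() rebasing above:

#include <assert.h>
#include <stdint.h>

#define FD_BO_HEAP_BLOCK_SIZE (4 * 1024 * 1024)

int
main(void)
{
   /* 256 blocks of 4MB cap each heap at 1GB of backing storage: */
   assert((uint64_t)FD_BO_HEAP_BLOCK_SIZE * 256 == (1ull << 30));

   /* submit_offset(): the kernel only knows about the backing block, so a
    * cmdstream offset inside a suballocated BO has to be rebased onto the
    * block.  Hypothetical addresses:
    */
   uint64_t block_iova = 0x100400000ull;
   uint64_t bo_iova = 0x100601000ull;  /* BO starts 0x201000 into the block */
   uint32_t offset = 0x40;             /* offset within the small BO */

   assert((uint32_t)(bo_iova - block_iova) + offset == 0x201040);
   return 0;
}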
@@ -147,6 +219,16 @@ struct fd_device {
    struct fd_bo_cache bo_cache;
    struct fd_bo_cache ring_cache;
 
+   /**
+    * Heap for mappable + cached-coherent + gpu-readonly (ie. cmdstream)
+    */
+   struct fd_bo_heap *ring_heap;
+
+   /**
+    * Heap for mappable (ie. majority of small buffer allocations, etc)
+    */
+   struct fd_bo_heap *default_heap;
+
    bool has_cached_coherent;
 
    bool closefd; /* call close(fd) upon destruction */

@@ -352,6 +434,7 @@ enum fd_bo_state {
 enum fd_bo_state fd_bo_state(struct fd_bo *bo);
 
 void fd_bo_init_common(struct fd_bo *bo, struct fd_device *dev);
+void fd_bo_fini_fences(struct fd_bo *bo);
 void fd_bo_fini_common(struct fd_bo *bo);
 
 struct fd_bo *fd_bo_new_ring(struct fd_device *dev, uint32_t size);

@@ -52,17 +52,46 @@ static struct fd_ringbuffer *
 fd_ringbuffer_sp_init(struct fd_ringbuffer_sp *fd_ring, uint32_t size,
                       enum fd_ringbuffer_flags flags);
 
+static void
+append_suballoc_bo(struct fd_submit_sp *submit, struct fd_bo *bo)
+{
+   uint32_t idx = READ_ONCE(bo->idx);
+
+   if (unlikely((idx >= submit->nr_suballoc_bos) ||
+                (submit->suballoc_bos[idx] != bo))) {
+      uint32_t hash = _mesa_hash_pointer(bo);
+      struct hash_entry *entry;
+
+      entry = _mesa_hash_table_search_pre_hashed(
+         submit->suballoc_bo_table, hash, bo);
+      if (entry) {
+         /* found */
+         idx = (uint32_t)(uintptr_t)entry->data;
+      } else {
+         idx = APPEND(submit, suballoc_bos, fd_bo_ref(bo));
+
+         _mesa_hash_table_insert_pre_hashed(
+            submit->suballoc_bo_table, hash, bo, (void *)(uintptr_t)idx);
+      }
+      bo->idx = idx;
+   }
+}
+
 /* add (if needed) bo to submit and return index: */
 uint32_t
 fd_submit_append_bo(struct fd_submit_sp *submit, struct fd_bo *bo)
 {
-   uint32_t idx;
+   if (suballoc_bo(bo)) {
+      append_suballoc_bo(submit, bo);
+      bo = fd_bo_heap_block(bo);
+   }
+
    /* NOTE: it is legal to use the same bo on different threads for
     * different submits.  But it is not legal to use the same submit
    * from different threads.
    */
-   idx = READ_ONCE(bo->idx);
+   uint32_t idx = READ_ONCE(bo->idx);
 
    if (unlikely((idx >= submit->nr_bos) || (submit->bos[idx] != bo))) {
      uint32_t hash = _mesa_hash_pointer(bo);

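A hypothetical sketch (not part of the patch, assumes the freedreno_ringbuffer_sp.h context above) of what the append path guarantees for a suballocated BO:

/* Sketch only: what fd_submit_append_bo() does for a suballocated BO. */
static void
append_sketch(struct fd_submit_sp *fd_submit, struct fd_bo *bo)
{
   assert(suballoc_bo(bo));

   uint32_t idx = fd_submit_append_bo(fd_submit, bo);

   /* The kernel-visible table gets the backing block; the small BO only
    * lands in suballoc_bos so a userspace fence can be attached at flush:
    */
   assert(fd_submit->bos[idx] == fd_bo_heap_block(bo));
}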
@@ -187,6 +216,9 @@ fd_submit_sp_flush_prep(struct fd_submit *submit, int in_fence_fd,
       fd_bo_add_fence(fd_submit->bos[i], out_fence);
       has_shared |= fd_submit->bos[i]->alloc_flags & FD_BO_SHARED;
    }
+   for (unsigned i = 0; i < fd_submit->nr_suballoc_bos; i++) {
+      fd_bo_add_fence(fd_submit->suballoc_bos[i], out_fence);
+   }
    simple_mtx_unlock(&fence_lock);
 
    fd_submit->out_fence = fd_fence_ref(out_fence);

@@ -385,6 +417,7 @@ fd_submit_sp_destroy(struct fd_submit *submit)
       fd_ringbuffer_del(fd_submit->suballoc_ring);
 
    _mesa_hash_table_destroy(fd_submit->bo_table, NULL);
+   _mesa_hash_table_destroy(fd_submit->suballoc_bo_table, NULL);
 
    // TODO it would be nice to have a way to assert() if all
    // rb's haven't been free'd back to the slab, because that is

@@ -392,11 +425,14 @@ fd_submit_sp_destroy(struct fd_submit *submit)
    slab_destroy_child(&fd_submit->ring_pool);
 
    fd_bo_del_array(fd_submit->bos, fd_submit->nr_bos);
+   free(fd_submit->bos);
+
+   fd_bo_del_array(fd_submit->suballoc_bos, fd_submit->nr_suballoc_bos);
+   free(fd_submit->suballoc_bos);
 
    if (fd_submit->out_fence)
       fd_fence_del(fd_submit->out_fence);
 
-   free(fd_submit->bos);
    free(fd_submit);
 }

@@ -412,8 +448,8 @@ fd_submit_sp_new(struct fd_pipe *pipe, flush_submit_list_fn flush_submit_list)
    struct fd_submit_sp *fd_submit = calloc(1, sizeof(*fd_submit));
    struct fd_submit *submit;
 
-   fd_submit->bo_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
-                                                 _mesa_key_pointer_equal);
+   fd_submit->bo_table = _mesa_pointer_hash_table_create(NULL);
+   fd_submit->suballoc_bo_table = _mesa_pointer_hash_table_create(NULL);
 
    slab_create_child(&fd_submit->ring_pool, &pipe->ring_pool);

@@ -50,9 +50,19 @@ struct fd_submit_sp {
 
    DECLARE_ARRAY(struct fd_bo *, bos);
 
+   /* Keep a separate table of sub-alloc BOs.. the backing objects are
+    * tracked in the main bos table (because this is what the kernel
+    * sees), but we need to attach userspace fences to the sub-alloc'd
+    * BOs so the driver knows when they are idle
+    */
+   DECLARE_ARRAY(struct fd_bo *, suballoc_bos);
+
    /* maps fd_bo to idx in bos table: */
    struct hash_table *bo_table;
 
+   /* maps fd_bo to idx in suballoc_bos table: */
+   struct hash_table *suballoc_bo_table;
+
    struct slab_child_pool ring_pool;
 
    /* Allow for sub-allocation of stateobj ring buffers (ie. sharing

@@ -20,6 +20,7 @@
 
 libfreedreno_drm_files = files(
   'freedreno_bo.c',
+  'freedreno_bo_heap.c',
   'freedreno_bo_cache.c',
   'freedreno_device.c',
   'freedreno_drmif.h',

@@ -314,7 +314,7 @@ msm_submit_flush(struct fd_submit *submit, int in_fence_fd, bool use_fence_fd)
 
       cmds[i].type = MSM_SUBMIT_CMD_IB_TARGET_BUF;
       cmds[i].submit_idx = append_bo(msm_submit, msm_ring->ring_bo);
-      cmds[i].submit_offset = msm_ring->offset;
+      cmds[i].submit_offset = submit_offset(msm_ring->ring_bo, msm_ring->offset);
       cmds[i].size = offset_bytes(ring->cur, ring->start);
       cmds[i].pad = 0;
       cmds[i].nr_relocs = msm_ring->cmd->nr_relocs;

@@ -328,9 +328,9 @@ msm_submit_flush(struct fd_submit *submit, int in_fence_fd, bool use_fence_fd)
       } else {
          cmds[i].type = MSM_SUBMIT_CMD_IB_TARGET_BUF;
       }
-      cmds[i].submit_idx =
-         append_bo(msm_submit, msm_ring->u.cmds[j]->ring_bo);
-      cmds[i].submit_offset = msm_ring->offset;
+      struct fd_bo *ring_bo = msm_ring->u.cmds[j]->ring_bo;
+      cmds[i].submit_idx = append_bo(msm_submit, ring_bo);
+      cmds[i].submit_offset = submit_offset(ring_bo, msm_ring->offset);
       cmds[i].size = msm_ring->u.cmds[j]->size;
       cmds[i].pad = 0;
       cmds[i].nr_relocs = msm_ring->u.cmds[j]->nr_relocs;

@@ -67,10 +67,10 @@ flush_submit_list(struct list_head *submit_list)
          to_fd_ringbuffer_sp(submit->primary);
 
       for (unsigned i = 0; i < deferred_primary->u.nr_cmds; i++) {
+         struct fd_bo *ring_bo = deferred_primary->u.cmds[i].ring_bo;
          cmds[cmd_idx].type = MSM_SUBMIT_CMD_BUF;
-         cmds[cmd_idx].submit_idx =
-            fd_submit_append_bo(fd_submit, deferred_primary->u.cmds[i].ring_bo);
-         cmds[cmd_idx].submit_offset = deferred_primary->offset;
+         cmds[cmd_idx].submit_idx = fd_submit_append_bo(fd_submit, ring_bo);
+         cmds[cmd_idx].submit_offset = submit_offset(ring_bo, deferred_primary->offset);
          cmds[cmd_idx].size = deferred_primary->u.cmds[i].size;
         cmds[cmd_idx].pad = 0;
         cmds[cmd_idx].nr_relocs = 0;

@@ -85,10 +85,10 @@ flush_submit_list(struct list_head *submit_list)
          to_fd_ringbuffer_sp(submit->primary);
 
       for (unsigned i = 0; i < deferred_primary->u.nr_cmds; i++) {
+         struct fd_bo *ring_bo = deferred_primary->u.cmds[i].ring_bo;
          cmds[cmd_idx].type = MSM_SUBMIT_CMD_BUF;
-         cmds[cmd_idx].submit_idx =
-            fd_submit_append_bo(fd_submit, deferred_primary->u.cmds[i].ring_bo);
-         cmds[cmd_idx].submit_offset = deferred_primary->offset;
+         cmds[cmd_idx].submit_idx = fd_submit_append_bo(fd_submit, ring_bo);
+         cmds[cmd_idx].submit_offset = submit_offset(ring_bo, deferred_primary->offset);
          cmds[cmd_idx].size = deferred_primary->u.cmds[i].size;
         cmds[cmd_idx].pad = 0;
         cmds[cmd_idx].nr_relocs = 0;
