asahi: rewrite queries
1. Always keep the query in GPU memory, so we can implement QBOs properly.
2. Use a lightweight data structure for tracking writers, to reduce overhead.
3. Allow many writers per query, to eliminate stalls.
4. Use a context-wide occlusion heap, to satisfy #1 without introducing flushes
   or silly copies. This is what the PVR Mesa driver does :-)

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
Committed by: Marge Bot
Parent: ca58bc239a
Commit: 23b4503225
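Points 2 and 3 of the message above replace the old single-writer pointer with per-batch generation counters, so any number of batches can write a query without stalling. Below is a minimal, self-contained sketch of that idea; the names (ctx, query, MAX_BATCHES, and the helper functions) are illustrative assumptions for this sketch, not the driver's actual API.

/* Sketch of generation-based writer tracking (illustrative names only).
 * Each batch slot carries a monotonically increasing generation. A query
 * remembers, per slot, the generation at which that batch last wrote it;
 * the batch is a live writer only while the two values still match.
 * Retiring or resetting a batch bumps its generation, which implicitly
 * detaches it from every query it wrote, with no per-query list upkeep.
 */
#include <stdbool.h>
#include <stdint.h>

#define MAX_BATCHES 128 /* arbitrary for this sketch */

struct ctx {
   uint64_t batch_generation[MAX_BATCHES];
};

struct query {
   uint64_t writer_generation[MAX_BATCHES];
};

static void
query_init(struct query *q)
{
   /* Sentinel that never equals a live generation */
   for (unsigned i = 0; i < MAX_BATCHES; ++i)
      q->writer_generation[i] = UINT64_MAX;
}

static void
query_add_writer(struct ctx *c, struct query *q, unsigned batch_idx)
{
   /* Record "this batch, at its current generation, writes this query" */
   q->writer_generation[batch_idx] = c->batch_generation[batch_idx];
}

static void
batch_retire(struct ctx *c, unsigned batch_idx)
{
   /* Invalidates this batch's claim on every query it wrote */
   c->batch_generation[batch_idx]++;
}

static bool
query_is_busy(const struct ctx *c, const struct query *q)
{
   for (unsigned i = 0; i < MAX_BATCHES; ++i) {
      if (q->writer_generation[i] == c->batch_generation[i])
         return true;
   }
   return false;
}

In the diff below, the same comparison appears in sync_query_writers() and is_query_busy(), with AGX_MAX_BATCHES slots and the generations stored in ctx->batches.generation.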
@@ -24,12 +24,6 @@
      agx_msg("[Batch %u] " fmt "\n", agx_batch_idx(batch), ##__VA_ARGS__);    \
   } while (0)

static unsigned
agx_batch_idx(struct agx_batch *batch)
{
   return batch - batch->ctx->batches.slots;
}

bool
agx_batch_is_active(struct agx_batch *batch)
{
@@ -125,9 +119,7 @@ agx_batch_init(struct agx_context *ctx,

   util_dynarray_init(&batch->scissor, ctx);
   util_dynarray_init(&batch->depth_bias, ctx);
   util_dynarray_init(&batch->occlusion_queries, ctx);
   util_dynarray_init(&batch->nonocclusion_queries, ctx);
   util_dynarray_init(&batch->timestamp_queries, ctx);
   util_dynarray_init(&batch->timestamps, ctx);

   batch->clear = 0;
   batch->draw = 0;
@@ -177,8 +169,6 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
   uint64_t begin_ts = ~0, end_ts = 0;
   /* TODO: UAPI pending */
   agx_finish_batch_queries(batch, begin_ts, end_ts);
   batch->occlusion_buffer.cpu = NULL;
   batch->occlusion_buffer.gpu = 0;

   if (reset) {
      int handle;
@@ -212,9 +202,7 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)

   util_dynarray_fini(&batch->scissor);
   util_dynarray_fini(&batch->depth_bias);
   util_dynarray_fini(&batch->occlusion_queries);
   util_dynarray_fini(&batch->nonocclusion_queries);
   util_dynarray_fini(&batch->timestamp_queries);
   util_dynarray_fini(&batch->timestamps);

   if (!(dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC))) {
      agx_batch_print_stats(dev, batch);
@@ -774,13 +762,6 @@ agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch)
   agx_batch_cleanup(ctx, batch, true);
}

void
agx_batch_add_timestamp_query(struct agx_batch *batch, struct agx_query *q)
{
   if (q)
      util_dynarray_append(&batch->timestamp_queries, struct agx_query *, q);
}

/*
 * Timestamp queries record the time after all current work is finished,
 * which we handle as the time after all current batches finish (since we're a
@@ -1333,19 +1333,6 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
    */
   agx_batch_add_bo(batch, batch->vdm.bo);

   /* Occlusion queries are allocated as a contiguous pool */
   unsigned oq_count =
      util_dynarray_num_elements(&batch->occlusion_queries, struct agx_query *);
   size_t oq_size = oq_count * sizeof(uint64_t);

   if (oq_size) {
      batch->occlusion_buffer =
         agx_pool_alloc_aligned(&batch->pool, oq_size, 64);
      memset(batch->occlusion_buffer.cpu, 0, oq_size);
   } else {
      batch->occlusion_buffer.gpu = 0;
   }

   if (batch->vs_scratch)
      agx_batch_add_bo(batch, ctx->scratch_vs.buf);
   if (batch->fs_scratch)
@@ -6,24 +6,16 @@

#include <stdint.h>
#include "pipe/p_defines.h"
#include "util/bitset.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_inlines.h"
#include "util/u_prim.h"
#include "agx_bo.h"
#include "agx_device.h"
#include "agx_state.h"
#include "pool.h"

static struct pipe_query *
agx_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
{
   struct agx_query *query = calloc(1, sizeof(struct agx_query));

   query->type = query_type;
   query->index = index;

   return (struct pipe_query *)query;
}

static bool
is_occlusion(struct agx_query *query)
{
@@ -49,40 +41,184 @@ is_timer(struct agx_query *query)
   }
}

#define AGX_MAX_OCCLUSION_QUERIES (65536)

struct agx_oq_heap {
   /* The GPU allocation itself */
   struct agx_bo *bo;

   /* Bitset of query indices that are in use */
   BITSET_DECLARE(available, AGX_MAX_OCCLUSION_QUERIES);
};

static void
agx_destroy_oq_heap(void *heap_)
{
   struct agx_oq_heap *heap = heap_;
   agx_bo_unreference(heap->bo);
}

static struct agx_oq_heap *
agx_alloc_oq_heap(struct agx_context *ctx)
{
   struct agx_oq_heap *heap = rzalloc(ctx, struct agx_oq_heap);
   ralloc_set_destructor(heap, agx_destroy_oq_heap);

   heap->bo = agx_bo_create(agx_device(ctx->base.screen),
                            AGX_MAX_OCCLUSION_QUERIES * sizeof(uint64_t),
                            AGX_BO_WRITEBACK, "Occlusion query heap");

   /* At the start, everything is available */
   BITSET_ONES(heap->available);

   return heap;
}

static struct agx_oq_heap *
agx_get_oq_heap(struct agx_context *ctx)
{
   if (!ctx->oq)
      ctx->oq = agx_alloc_oq_heap(ctx);

   return ctx->oq;
}

static struct agx_ptr
agx_alloc_oq(struct agx_context *ctx)
{
   struct agx_oq_heap *heap = agx_get_oq_heap(ctx);

   /* Find first available */
   int ffs = BITSET_FFS(heap->available);
   if (!ffs)
      return (struct agx_ptr){NULL, 0};

   /* Allocate it */
   unsigned index = ffs - 1;
   BITSET_CLEAR(heap->available, index);

   unsigned offset = index * sizeof(uint64_t);

   return (struct agx_ptr){
      (uint8_t *)heap->bo->ptr.cpu + offset,
      heap->bo->ptr.gpu + offset,
   };
}

static unsigned
agx_oq_index(struct agx_context *ctx, struct agx_query *q)
{
   assert(is_occlusion(q));

   return (q->ptr.gpu - ctx->oq->bo->ptr.gpu) / sizeof(uint64_t);
}

static void
agx_free_oq(struct agx_context *ctx, struct agx_query *q)
{
   struct agx_oq_heap *heap = agx_get_oq_heap(ctx);
   unsigned index = agx_oq_index(ctx, q);

   assert(index < AGX_MAX_OCCLUSION_QUERIES);
   assert(!BITSET_TEST(heap->available, index));

   BITSET_SET(heap->available, index);
}

uint64_t
agx_get_occlusion_heap(struct agx_batch *batch)
{
   if (!batch->ctx->oq)
      return 0;

   struct agx_bo *bo = batch->ctx->oq->bo;

   if (agx_batch_uses_bo(batch, bo))
      return bo->ptr.gpu;
   else
      return 0;
}

static struct pipe_query *
agx_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
{
   struct agx_query *query = calloc(1, sizeof(struct agx_query));

   query->type = query_type;
   query->index = index;

   /* Set all writer generations to a sentinel that will always compare as
    * false, since nothing writes to no queries.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(query->writer_generation); ++i) {
      query->writer_generation[i] = UINT64_MAX;
   }

   if (is_occlusion(query)) {
      query->ptr = agx_alloc_oq(agx_context(ctx));
   } else {
      /* TODO: a BO for the query is wasteful, but we benefit from BO list
       * tracking / reference counting to deal with lifetimes.
       */
      query->bo = agx_bo_create(agx_device(ctx->screen), sizeof(uint64_t) * 2,
                                AGX_BO_WRITEBACK, "Query");
      query->ptr = query->bo->ptr;
   }

   if (!query->ptr.gpu) {
      free(query);
      return NULL;
   }

   return (struct pipe_query *)query;
}

static void
sync_query_writers(struct agx_context *ctx, struct agx_query *query,
                   const char *reason)
{
   STATIC_ASSERT(ARRAY_SIZE(ctx->batches.generation) == AGX_MAX_BATCHES);
   STATIC_ASSERT(ARRAY_SIZE(ctx->batches.slots) == AGX_MAX_BATCHES);
   STATIC_ASSERT(ARRAY_SIZE(query->writer_generation) == AGX_MAX_BATCHES);

   for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
      if (query->writer_generation[i] == ctx->batches.generation[i])
         agx_sync_batch_for_reason(ctx, &ctx->batches.slots[i], reason);
   }
}

static bool
is_query_busy(struct agx_context *ctx, struct agx_query *query)
{
   for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
      if (query->writer_generation[i] == ctx->batches.generation[i])
         return true;
   }

   return false;
}

static void
agx_destroy_query(struct pipe_context *pctx, struct pipe_query *pquery)
{
   struct agx_context *ctx = agx_context(pctx);
   struct agx_query *query = (struct agx_query *)pquery;

   /* It is legal for the query to be destroyed before its value is read,
    * particularly during application teardown. In this case, don't leave a
    * dangling reference to the query.
   /* We don't reference count the occlusion query allocations, so we need to
    * sync writers when destroying so we can freely write from the CPU after
    * it's destroyed, since the driver will assume an available query is idle.
    *
    * For other queries, the BO itself is reference counted after the pipe_query
    * is destroyed so we don't need to flush.
    */
   if (query->writer) {
      assert(!is_timer(query) && "single writer not used here");

      struct agx_batch *writer = query->writer;
      struct util_dynarray *array = is_occlusion(query)
                                       ? &writer->occlusion_queries
                                       : &writer->nonocclusion_queries;
      struct agx_query **ptr =
         util_dynarray_element(array, struct agx_query *, query->writer_index);

      assert((*ptr) == query && "data structure invariant");
      *ptr = NULL;
   } else if (is_timer(query)) {
      /* Potentially has many writers! We need them all to synchronize so they
       * don't have dangling references. Syncing will destroy batches that hold
       * references as required.
       *
       * TODO: Optimize this, timestamp queries are bonkers on tilers.
       */
      agx_flush_all(ctx, "Destroying time query");
      agx_sync_all(ctx, "Destroying time query");
   if (is_occlusion(query)) {
      sync_query_writers(ctx, query, "Occlusion query destroy");
      agx_free_oq(ctx, query);
   } else {
      agx_bo_unreference(query->bo);
   }

   free(query);
   free(pquery);
}

static bool
@@ -118,9 +254,7 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)

   case PIPE_QUERY_TIME_ELAPSED:
      ctx->time_elapsed = query;
      query->timestamp_begin = UINT64_MAX;
      query->timestamp_end = 0;
      return true;
      break;

   case PIPE_QUERY_TIMESTAMP:
      /* No-op */
@@ -135,17 +269,17 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)
      return false;
   }

   /* begin_query zeroes, flush so we can do that write. If anything (i.e.
    * other than piglit) actually hits this, we could shadow the query to
    * avoid the flush.
    */
   if (query->writer) {
      agx_flush_batch_for_reason(ctx, query->writer, "Query overwritten");
      agx_sync_batch_for_reason(ctx, query->writer, "Query overwrriten");
   /* begin_query zeroes, sync so we can do that write from the CPU */
   sync_query_writers(ctx, query, "Query overwritten");

   uint64_t *ptr = query->ptr.cpu;
   ptr[0] = 0;

   if (query->type == PIPE_QUERY_TIME_ELAPSED) {
      /* Timestamp begin in second record, the timestamp end in the first */
      ptr[1] = UINT64_MAX;
   }

   assert(query->writer == NULL);
   query->value = 0;
   return true;
}
@@ -183,15 +317,17 @@ agx_end_query(struct pipe_context *pctx, struct pipe_query *pquery)
      assert(query->index < ARRAY_SIZE(ctx->pipeline_statistics));
      ctx->pipeline_statistics[query->index] = NULL;
      return true;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP: {
      /* Timestamp logically written now, set up batches to MAX their finish
       * time in. If there are no batches, it's just the current time stamp.
       */
      agx_add_timestamp_end_query(ctx, query);

      query->timestamp_end = agx_get_gpu_timestamp(dev);
      uint64_t *value = query->ptr.cpu;
      *value = agx_get_gpu_timestamp(dev);

      return true;
   }
   default:
      return false;
   }
@@ -205,56 +341,37 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery,
   struct agx_context *ctx = agx_context(pctx);
   struct agx_device *dev = agx_device(pctx->screen);

   /* For GPU queries, flush the writer. When the writer is flushed, the GPU
    * will write the value, and when we wait for the writer, the CPU will read
    * the value into query->value.
    */
   if (query->writer != NULL) {
      /* Querying the result forces a query to finish in finite time, so we
       * need to flush. Furthermore, we need all earlier queries
       * to finish before this query, so we sync unconditionally (so we can
       * maintain the lie that all queries are finished when read).
       *
       * TODO: Optimize based on wait flag.
       */
      struct agx_batch *writer = query->writer;
      agx_flush_batch_for_reason(ctx, writer, "GPU query");
      agx_sync_batch_for_reason(ctx, writer, "GPU query");
   } else if (query->type == PIPE_QUERY_TIMESTAMP ||
              query->type == PIPE_QUERY_TIME_ELAPSED) {
      /* TODO: Optimize this... timestamp queries are bonkers on tilers. */
      agx_flush_all(ctx, "Timestamp query");
      agx_sync_all(ctx, "Timestamp query");
   }
   /* TODO: Honour `wait` */
   sync_query_writers(ctx, query, "Reading query results");

   /* After syncing, there is no writer left, so query->value is ready */
   assert(query->writer == NULL && "cleared when cleaning up batch");
   uint64_t *ptr = query->ptr.cpu;
   uint64_t value = *ptr;

   switch (query->type) {
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      vresult->b = query->value;
      vresult->b = value;
      return true;

   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      vresult->b = query->value > 0;
      vresult->b = value > 0;
      return true;

   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
      vresult->u64 = query->value;
      vresult->u64 = value;
      return true;

   case PIPE_QUERY_TIMESTAMP:
      vresult->u64 = agx_gpu_time_to_ns(dev, query->timestamp_end);
      vresult->u64 = agx_gpu_time_to_ns(dev, value);
      return true;

   case PIPE_QUERY_TIME_ELAPSED:
      vresult->u64 =
         agx_gpu_time_to_ns(dev, query->timestamp_end - query->timestamp_begin);
      /* end - begin */
      vresult->u64 = agx_gpu_time_to_ns(dev, ptr[0] - ptr[1]);
      return true;

   default:
@@ -272,26 +389,25 @@ agx_get_query_result_resource(struct pipe_context *pipe, struct pipe_query *q,

   /* TODO: Don't cheat XXX */
   struct agx_context *ctx = agx_context(pipe);
   agx_sync_all(ctx, "Stubbed QBOs");

   union pipe_query_result result;
   if (index < 0) {
      /* availability */
      result.u64 = 1;
      result.u64 = !is_query_busy(ctx, query);
   } else {
      bool ready = agx_get_query_result(pipe, q, true, &result);
      assert(ready);
   }

   switch (query->type) {
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      result.u64 = result.b;
      break;
   default:
      break;
      switch (query->type) {
      case PIPE_QUERY_OCCLUSION_PREDICATE:
      case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         result.u64 = result.b;
         break;
      default:
         break;
      }
   }

   /* Clamp to type, arb_query_buffer_object-qbo tests */
@@ -318,58 +434,36 @@ agx_set_active_query_state(struct pipe_context *pipe, bool enable)
   ctx->dirty |= AGX_DIRTY_QUERY;
}

static uint16_t
agx_add_query_to_batch(struct agx_batch *batch, struct agx_query *query,
                       struct util_dynarray *array)
static void
agx_add_query_to_batch(struct agx_batch *batch, struct agx_query *query)
{
   /* If written by another batch, flush it now. If this affects real apps, we
    * could avoid this flush by merging query results.
    */
   if (query->writer && query->writer != batch) {
      agx_sync_batch_for_reason(batch->ctx, query->writer,
                                "Multiple query writers");
   unsigned idx = agx_batch_idx(batch);
   struct agx_bo *bo = is_occlusion(query) ? batch->ctx->oq->bo : query->bo;

   agx_batch_add_bo(batch, bo);
   query->writer_generation[idx] = batch->ctx->batches.generation[idx];
}

void
agx_batch_add_timestamp_query(struct agx_batch *batch, struct agx_query *q)
{
   if (q) {
      agx_add_query_to_batch(batch, q);
      util_dynarray_append(&batch->timestamps, struct agx_ptr, q->ptr);
   }

   /* Allocate if needed */
   if (query->writer == NULL) {
      query->writer = batch;
      query->writer_index =
         util_dynarray_num_elements(array, struct agx_query *);

      util_dynarray_append(array, struct agx_query *, query);
   }

   assert(query->writer == batch);
   assert(*util_dynarray_element(array, struct agx_query *,
                                 query->writer_index) == query);

   return query->writer_index;
}

uint16_t
agx_get_oq_index(struct agx_batch *batch, struct agx_query *query)
{
   assert(is_occlusion(query));

   return agx_add_query_to_batch(batch, query, &batch->occlusion_queries);
   agx_add_query_to_batch(batch, query);
   return agx_oq_index(batch->ctx, query);
}

uint64_t
agx_get_query_address(struct agx_batch *batch, struct agx_query *query)
{
   assert(!is_occlusion(query));

   agx_add_query_to_batch(batch, query, &batch->nonocclusion_queries);

   /* Allocate storage for the query in the batch */
   if (!query->ptr.cpu) {
      query->ptr = agx_pool_alloc_aligned(&batch->pool, sizeof(uint64_t),
                                          sizeof(uint64_t));

      uint64_t *value = query->ptr.cpu;
      *value = 0;
   }

   agx_add_query_to_batch(batch, query);
   return query->ptr.gpu;
}
@@ -377,61 +471,31 @@ void
agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
                         uint64_t end_ts)
{
   uint64_t *occlusion = (uint64_t *)batch->occlusion_buffer.cpu;
   /* Remove the batch as write from all queries by incrementing the generation
    * of the batch.
    */
   batch->ctx->batches.generation[agx_batch_idx(batch)]++;

   util_dynarray_foreach(&batch->occlusion_queries, struct agx_query *, it) {
      struct agx_query *query = *it;
   /* Write out timestamps */
   util_dynarray_foreach(&batch->timestamps, struct agx_ptr, it) {
      uint64_t *ptr = it->cpu;

      /* Skip queries that have since been destroyed */
      if (query == NULL)
         continue;

      assert(query->writer == batch);

      /* Get the result for this batch. If occlusion is NULL, it means that no
       * draws actually enabled any occlusion queries, so there's no change.
       */
      if (occlusion != NULL) {
         uint64_t result = *(occlusion++);

         /* Accumulate with the previous result (e.g. in case we split a frame
          * into multiple batches so an API-level query spans multiple batches).
          */
         if (query->type == PIPE_QUERY_OCCLUSION_COUNTER)
            query->value += result;
         else
            query->value |= (!!result);
      }

      query->writer = NULL;
      query->writer_index = 0;
      ptr[0] = MAX2(ptr[0], end_ts);
      ptr[1] = MIN2(ptr[1], begin_ts);
   }
}

   /* Now handle non-occlusion queries in a similar way */
   util_dynarray_foreach(&batch->nonocclusion_queries, struct agx_query *, it) {
      struct agx_query *query = *it;
      if (query == NULL)
         continue;
void
agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
                        uint64_t increment)
{
   if (!query)
      return;

      assert(query->writer == batch);
   sync_query_writers(ctx, query, "CPU query increment");

   /* Accumulate */
   uint64_t *value = query->ptr.cpu;
   query->value += (*value);
   query->writer = NULL;
   query->writer_index = 0;
   query->ptr.cpu = NULL;
   query->ptr.gpu = 0;
}

   util_dynarray_foreach(&batch->timestamp_queries, struct agx_query *, it) {
      struct agx_query *query = *it;
      if (query == NULL)
         continue;

      query->timestamp_begin = MIN2(query->timestamp_begin, begin_ts);
      query->timestamp_end = MAX2(query->timestamp_end, end_ts);
   }
   uint64_t *value = query->ptr.cpu;
   *value += increment;
}

static void
@@ -3974,13 +3974,11 @@ agx_ia_update_direct(struct agx_context *ctx, const struct pipe_draw_info *info,

   count *= info->instance_count;

   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]) {
      ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]->value += count;
   }
   agx_query_increment_cpu(
      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES], count);

   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]) {
      ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]->value += count;
   }
   agx_query_increment_cpu(
      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS], count);
}

static uint64_t
@@ -4676,10 +4674,9 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
      return;

   /* TCS invocation counter increments once per-patch */
   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]) {
      ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]->value +=
         in_patches;
   }
   agx_query_increment_cpu(
      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS],
      in_patches);

   struct agx_batch *batch = agx_get_compute_batch(ctx);
   agx_batch_init_state(batch);
@@ -4830,10 +4827,9 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
   desc[4] = 0; /* start_instance */

   /* TES invocation counter increments once per tessellated vertex */
   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]) {
      ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]->value +=
         data.num_domain_points;
   }
   agx_query_increment_cpu(
      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS],
      data.num_domain_points);
   }
   p_tess_destroy(tess);
@@ -5419,7 +5415,8 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
      unsigned blocksize = info->block[0] * info->block[1] * info->block[2];
      unsigned count = workgroups * blocksize;

      ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS]->value += count;
      agx_query_increment_cpu(
         ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS], count);
   }

   struct agx_batch *batch = agx_get_compute_batch(ctx);
@@ -372,15 +372,8 @@ struct agx_batch {
   /* Scissor and depth-bias descriptors, uploaded at GPU time */
   struct util_dynarray scissor, depth_bias;

   /* Indexed occlusion queries within the occlusion buffer, and the occlusion
    * buffer itself which is allocated at submit time.
    */
   struct util_dynarray occlusion_queries;
   struct agx_ptr occlusion_buffer;

   /* Non-occlusion queries */
   struct util_dynarray nonocclusion_queries;
   struct util_dynarray timestamp_queries;
   /* Arrays of GPU pointers that should be written with the batch timestamps */
   struct util_dynarray timestamps;

   /* Result buffer where the kernel places command execution information */
   union agx_batch_result *result;
@@ -562,6 +555,8 @@ struct asahi_blitter {
   void *saved_cs;
};

struct agx_oq_heap;

struct agx_context {
   struct pipe_context base;
   struct agx_compiled_shader *vs, *fs, *gs, *tcs, *tes;
@@ -570,6 +565,9 @@ struct agx_context {
   /* Heap for dynamic memory allocation for geometry/tessellation shaders */
   struct pipe_resource *heap;

   /* Occlusion query heap */
   struct agx_oq_heap *oq;

   /* Acts as a context-level shader key */
   bool support_lod_bias;
   bool robust;
@@ -586,6 +584,12 @@ struct agx_context {

      /** Set of submitted batches for faster traversal */
      BITSET_DECLARE(submitted, AGX_MAX_BATCHES);

      /* Monotonic counter for each batch incremented when resetting a batch to
       * invalidate all associated queries. Compared to
       * agx_query::writer_generation.
       */
      uint64_t generation[AGX_MAX_BATCHES];
   } batches;

   struct agx_batch *batch;
@@ -656,6 +660,12 @@ struct agx_context {
   struct agx_scratch scratch_cs;
};

static inline unsigned
agx_batch_idx(struct agx_batch *batch)
{
   return batch - batch->ctx->batches.slots;
}

static void
agx_writer_add(struct agx_context *ctx, uint8_t batch_index, unsigned handle)
{
@@ -759,30 +769,9 @@ struct agx_query {
   unsigned type;
   unsigned index;

   /* Invariant for occlusion queries:
    *
    * writer != NULL => writer->occlusion_queries[writer_index] == this, and
    * writer == NULL => no batch such that this in batch->occlusion_queries
    */
   struct agx_batch *writer;
   unsigned writer_index;

   /* For GPU queries other than occlusion queries, the value of the query as
    * written by the `writer` if a writer is non-NULL, and irrelevant otherwise.
    * When flushing the query, this value is read and added to agx_query::value.
    */
   uint64_t writer_generation[AGX_MAX_BATCHES];
   struct agx_bo *bo;
   struct agx_ptr ptr;

   /* Accumulator flushed to the CPU */
   union {
      uint64_t value;
      uint64_t timestamp_end;
   };

   /* For time elapsed queries, end is in the above union for consistent
    * handling with timestamp queries.
    */
   uint64_t timestamp_begin;
};

struct agx_sampler_state {
@@ -1046,6 +1035,9 @@ void agx_batch_add_timestamp_query(struct agx_batch *batch,
                                   struct agx_query *q);
void agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q);

void agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
                             uint64_t increment);

/* Blit shaders */
void agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
                      bool render_cond);
@@ -1072,6 +1064,7 @@ uint64_t agx_build_meta(struct agx_batch *batch, bool store,
uint16_t agx_get_oq_index(struct agx_batch *batch, struct agx_query *query);
uint64_t agx_get_query_address(struct agx_batch *batch,
                               struct agx_query *query);
uint64_t agx_get_occlusion_heap(struct agx_batch *batch);

void agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
                              uint64_t end_ts);
@@ -171,8 +171,8 @@ agx_primitives_update_direct(struct agx_context *ctx,
   assert(!ctx->stage[PIPE_SHADER_GEOMETRY].shader &&
          "Geometry shaders use their own counting");

   ctx->prims_generated[0]->value +=
      xfb_prims_for_vertices(info->mode, draw->count);
   agx_query_increment_cpu(ctx, ctx->prims_generated[0],
                           xfb_prims_for_vertices(info->mode, draw->count));
}

void