diff --git a/src/gallium/drivers/asahi/agx_batch.c b/src/gallium/drivers/asahi/agx_batch.c
index 557b3f15890..3d75cdc58ff 100644
--- a/src/gallium/drivers/asahi/agx_batch.c
+++ b/src/gallium/drivers/asahi/agx_batch.c
@@ -24,12 +24,6 @@
          agx_msg("[Batch %u] " fmt "\n", agx_batch_idx(batch), ##__VA_ARGS__); \
    } while (0)
 
-static unsigned
-agx_batch_idx(struct agx_batch *batch)
-{
-   return batch - batch->ctx->batches.slots;
-}
-
 bool
 agx_batch_is_active(struct agx_batch *batch)
 {
@@ -125,9 +119,7 @@ agx_batch_init(struct agx_context *ctx,
 
    util_dynarray_init(&batch->scissor, ctx);
    util_dynarray_init(&batch->depth_bias, ctx);
-   util_dynarray_init(&batch->occlusion_queries, ctx);
-   util_dynarray_init(&batch->nonocclusion_queries, ctx);
-   util_dynarray_init(&batch->timestamp_queries, ctx);
+   util_dynarray_init(&batch->timestamps, ctx);
 
    batch->clear = 0;
    batch->draw = 0;
@@ -177,8 +169,6 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
 
    uint64_t begin_ts = ~0, end_ts = 0; /* TODO: UAPI pending */
    agx_finish_batch_queries(batch, begin_ts, end_ts);
-   batch->occlusion_buffer.cpu = NULL;
-   batch->occlusion_buffer.gpu = 0;
 
    if (reset) {
      int handle;
@@ -212,9 +202,7 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
 
    util_dynarray_fini(&batch->scissor);
    util_dynarray_fini(&batch->depth_bias);
-   util_dynarray_fini(&batch->occlusion_queries);
-   util_dynarray_fini(&batch->nonocclusion_queries);
-   util_dynarray_fini(&batch->timestamp_queries);
+   util_dynarray_fini(&batch->timestamps);
 
    if (!(dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC))) {
       agx_batch_print_stats(dev, batch);
@@ -774,13 +762,6 @@ agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch)
    agx_batch_cleanup(ctx, batch, true);
 }
 
-void
-agx_batch_add_timestamp_query(struct agx_batch *batch, struct agx_query *q)
-{
-   if (q)
-      util_dynarray_append(&batch->timestamp_queries, struct agx_query *, q);
-}
-
 /*
  * Timestamp queries record the time after all current work is finished,
  * which we handle as the time after all current batches finish (since we're a
diff --git a/src/gallium/drivers/asahi/agx_pipe.c b/src/gallium/drivers/asahi/agx_pipe.c
index 58a05eee432..5889c50a71d 100644
--- a/src/gallium/drivers/asahi/agx_pipe.c
+++ b/src/gallium/drivers/asahi/agx_pipe.c
@@ -1333,19 +1333,6 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
     */
    agx_batch_add_bo(batch, batch->vdm.bo);
 
-   /* Occlusion queries are allocated as a contiguous pool */
-   unsigned oq_count =
-      util_dynarray_num_elements(&batch->occlusion_queries, struct agx_query *);
-   size_t oq_size = oq_count * sizeof(uint64_t);
-
-   if (oq_size) {
-      batch->occlusion_buffer =
-         agx_pool_alloc_aligned(&batch->pool, oq_size, 64);
-      memset(batch->occlusion_buffer.cpu, 0, oq_size);
-   } else {
-      batch->occlusion_buffer.gpu = 0;
-   }
-
    if (batch->vs_scratch)
       agx_batch_add_bo(batch, ctx->scratch_vs.buf);
    if (batch->fs_scratch)
diff --git a/src/gallium/drivers/asahi/agx_query.c b/src/gallium/drivers/asahi/agx_query.c
index 193fb92f06f..9a04ae3267b 100644
--- a/src/gallium/drivers/asahi/agx_query.c
+++ b/src/gallium/drivers/asahi/agx_query.c
@@ -6,24 +6,16 @@
 
 #include
 #include "pipe/p_defines.h"
+#include "util/bitset.h"
 #include "util/macros.h"
+#include "util/ralloc.h"
 #include "util/u_inlines.h"
 #include "util/u_prim.h"
+#include "agx_bo.h"
 #include "agx_device.h"
 #include "agx_state.h"
 #include "pool.h"
 
-static struct pipe_query *
-agx_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
-{
-   struct agx_query *query = calloc(1, sizeof(struct agx_query));
-
-   query->type = query_type;
-   query->index = index;
-
-   return (struct pipe_query *)query;
-}
-
 static bool
 is_occlusion(struct agx_query *query)
 {
@@ -49,40 +41,184 @@ is_timer(struct agx_query *query)
    }
 }
 
+#define AGX_MAX_OCCLUSION_QUERIES (65536)
+
+struct agx_oq_heap {
+   /* The GPU allocation itself */
+   struct agx_bo *bo;
+
+   /* Bitset of query indices that are available */
+   BITSET_DECLARE(available, AGX_MAX_OCCLUSION_QUERIES);
+};
+
+static void
+agx_destroy_oq_heap(void *heap_)
+{
+   struct agx_oq_heap *heap = heap_;
+   agx_bo_unreference(heap->bo);
+}
+
+static struct agx_oq_heap *
+agx_alloc_oq_heap(struct agx_context *ctx)
+{
+   struct agx_oq_heap *heap = rzalloc(ctx, struct agx_oq_heap);
+   ralloc_set_destructor(heap, agx_destroy_oq_heap);
+
+   heap->bo = agx_bo_create(agx_device(ctx->base.screen),
+                            AGX_MAX_OCCLUSION_QUERIES * sizeof(uint64_t),
+                            AGX_BO_WRITEBACK, "Occlusion query heap");
+
+   /* At the start, everything is available */
+   BITSET_ONES(heap->available);
+
+   return heap;
+}
+
+static struct agx_oq_heap *
+agx_get_oq_heap(struct agx_context *ctx)
+{
+   if (!ctx->oq)
+      ctx->oq = agx_alloc_oq_heap(ctx);
+
+   return ctx->oq;
+}
+
+static struct agx_ptr
+agx_alloc_oq(struct agx_context *ctx)
+{
+   struct agx_oq_heap *heap = agx_get_oq_heap(ctx);
+
+   /* Find first available */
+   int ffs = BITSET_FFS(heap->available);
+   if (!ffs)
+      return (struct agx_ptr){NULL, 0};
+
+   /* Allocate it */
+   unsigned index = ffs - 1;
+   BITSET_CLEAR(heap->available, index);
+
+   unsigned offset = index * sizeof(uint64_t);
+
+   return (struct agx_ptr){
+      (uint8_t *)heap->bo->ptr.cpu + offset,
+      heap->bo->ptr.gpu + offset,
+   };
+}
+
+static unsigned
+agx_oq_index(struct agx_context *ctx, struct agx_query *q)
+{
+   assert(is_occlusion(q));
+
+   return (q->ptr.gpu - ctx->oq->bo->ptr.gpu) / sizeof(uint64_t);
+}
+
+static void
+agx_free_oq(struct agx_context *ctx, struct agx_query *q)
+{
+   struct agx_oq_heap *heap = agx_get_oq_heap(ctx);
+   unsigned index = agx_oq_index(ctx, q);
+
+   assert(index < AGX_MAX_OCCLUSION_QUERIES);
+   assert(!BITSET_TEST(heap->available, index));
+
+   BITSET_SET(heap->available, index);
+}
+
+uint64_t
+agx_get_occlusion_heap(struct agx_batch *batch)
+{
+   if (!batch->ctx->oq)
+      return 0;
+
+   struct agx_bo *bo = batch->ctx->oq->bo;
+
+   if (agx_batch_uses_bo(batch, bo))
+      return bo->ptr.gpu;
+   else
+      return 0;
+}
+
+static struct pipe_query *
+agx_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
+{
+   struct agx_query *query = calloc(1, sizeof(struct agx_query));
+
+   query->type = query_type;
+   query->index = index;
+
+   /* Set all writer generations to a sentinel that will always compare as
+    * false, since no batch has written this freshly created query.
+    */
+   for (unsigned i = 0; i < ARRAY_SIZE(query->writer_generation); ++i) {
+      query->writer_generation[i] = UINT64_MAX;
+   }
+
+   if (is_occlusion(query)) {
+      query->ptr = agx_alloc_oq(agx_context(ctx));
+   } else {
+      /* TODO: a BO for the query is wasteful, but we benefit from BO list
+       * tracking / reference counting to deal with lifetimes.
+       */
+      query->bo = agx_bo_create(agx_device(ctx->screen), sizeof(uint64_t) * 2,
+                                AGX_BO_WRITEBACK, "Query");
+      query->ptr = query->bo->ptr;
+   }
+
+   if (!query->ptr.gpu) {
+      free(query);
+      return NULL;
+   }
+
+   return (struct pipe_query *)query;
+}
+
+static void
+sync_query_writers(struct agx_context *ctx, struct agx_query *query,
+                   const char *reason)
+{
+   STATIC_ASSERT(ARRAY_SIZE(ctx->batches.generation) == AGX_MAX_BATCHES);
+   STATIC_ASSERT(ARRAY_SIZE(ctx->batches.slots) == AGX_MAX_BATCHES);
+   STATIC_ASSERT(ARRAY_SIZE(query->writer_generation) == AGX_MAX_BATCHES);
+
+   for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
+      if (query->writer_generation[i] == ctx->batches.generation[i])
+         agx_sync_batch_for_reason(ctx, &ctx->batches.slots[i], reason);
+   }
+}
+
+static bool
+is_query_busy(struct agx_context *ctx, struct agx_query *query)
+{
+   for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
+      if (query->writer_generation[i] == ctx->batches.generation[i])
+         return true;
+   }
+
+   return false;
+}
+
 static void
 agx_destroy_query(struct pipe_context *pctx, struct pipe_query *pquery)
 {
    struct agx_context *ctx = agx_context(pctx);
    struct agx_query *query = (struct agx_query *)pquery;
 
-   /* It is legal for the query to be destroyed before its value is read,
-    * particularly during application teardown. In this case, don't leave a
-    * dangling reference to the query.
+   /* We don't reference count the occlusion query allocations, so we need to
+    * sync writers when destroying so we can freely write from the CPU after
+    * it's destroyed, since the driver will assume an available query is idle.
+    *
+    * For other queries, the BO itself is reference counted, so it stays valid
+    * after the pipe_query is destroyed and we don't need to flush.
     */
-   if (query->writer) {
-      assert(!is_timer(query) && "single writer not used here");
-
-      struct agx_batch *writer = query->writer;
-      struct util_dynarray *array = is_occlusion(query)
-                                       ? &writer->occlusion_queries
-                                       : &writer->nonocclusion_queries;
-      struct agx_query **ptr =
-         util_dynarray_element(array, struct agx_query *, query->writer_index);
-
-      assert((*ptr) == query && "data structure invariant");
-      *ptr = NULL;
-   } else if (is_timer(query)) {
-      /* Potentially has many writers! We need them all to synchronize so they
-       * don't have dangling references. Syncing will destroy batches that hold
-       * references as required.
-       *
-       * TODO: Optimize this, timestamp queries are bonkers on tilers.
-       */
-      agx_flush_all(ctx, "Destroying time query");
-      agx_sync_all(ctx, "Destroying time query");
+   if (is_occlusion(query)) {
+      sync_query_writers(ctx, query, "Occlusion query destroy");
+      agx_free_oq(ctx, query);
+   } else {
+      agx_bo_unreference(query->bo);
    }
 
-   free(query);
+   free(pquery);
 }
 
 static bool
@@ -118,9 +254,7 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)
 
    case PIPE_QUERY_TIME_ELAPSED:
       ctx->time_elapsed = query;
-      query->timestamp_begin = UINT64_MAX;
-      query->timestamp_end = 0;
-      return true;
+      break;
 
    case PIPE_QUERY_TIMESTAMP:
       /* No-op */
@@ -135,17 +269,17 @@
       return false;
    }
 
-   /* begin_query zeroes, flush so we can do that write. If anything (i.e.
-    * other than piglit) actually hits this, we could shadow the query to
-    * avoid the flush.
-    */
-   if (query->writer) {
-      agx_flush_batch_for_reason(ctx, query->writer, "Query overwritten");
-      agx_sync_batch_for_reason(ctx, query->writer, "Query overwrriten");
+   /* begin_query zeroes, sync so we can do that write from the CPU */
+   sync_query_writers(ctx, query, "Query overwritten");
+
+   uint64_t *ptr = query->ptr.cpu;
+   ptr[0] = 0;
+
+   if (query->type == PIPE_QUERY_TIME_ELAPSED) {
+      /* Timestamp begin is in the second slot, the end in the first */
+      ptr[1] = UINT64_MAX;
    }
 
-   assert(query->writer == NULL);
-   query->value = 0;
    return true;
 }
 
@@ -183,15 +317,17 @@ agx_end_query(struct pipe_context *pctx, struct pipe_query *pquery)
       assert(query->index < ARRAY_SIZE(ctx->pipeline_statistics));
       ctx->pipeline_statistics[query->index] = NULL;
       return true;
-   case PIPE_QUERY_TIMESTAMP:
+   case PIPE_QUERY_TIMESTAMP: {
      /* Timestamp logically written now, set up batches to MAX their finish
       * time in. If there are no batches, it's just the current time stamp.
       */
      agx_add_timestamp_end_query(ctx, query);
 
-      query->timestamp_end = agx_get_gpu_timestamp(dev);
+      uint64_t *value = query->ptr.cpu;
+      *value = agx_get_gpu_timestamp(dev);
       return true;
+   }
 
    default:
       return false;
    }
@@ -205,56 +341,37 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery,
    struct agx_context *ctx = agx_context(pctx);
    struct agx_device *dev = agx_device(pctx->screen);
 
-   /* For GPU queries, flush the writer. When the writer is flushed, the GPU
-    * will write the value, and when we wait for the writer, the CPU will read
-    * the value into query->value.
-    */
-   if (query->writer != NULL) {
-      /* Querying the result forces a query to finish in finite time, so we
-       * need to flush. Furthermore, we need all earlier queries
-       * to finish before this query, so we sync unconditionally (so we can
-       * maintain the lie that all queries are finished when read).
-       *
-       * TODO: Optimize based on wait flag.
-       */
-      struct agx_batch *writer = query->writer;
-      agx_flush_batch_for_reason(ctx, writer, "GPU query");
-      agx_sync_batch_for_reason(ctx, writer, "GPU query");
-   } else if (query->type == PIPE_QUERY_TIMESTAMP ||
-              query->type == PIPE_QUERY_TIME_ELAPSED) {
-      /* TODO: Optimize this... timestamp queries are bonkers on tilers. */
-      agx_flush_all(ctx, "Timestamp query");
-      agx_sync_all(ctx, "Timestamp query");
-   }
+   /* TODO: Honour `wait` */
+   sync_query_writers(ctx, query, "Reading query results");
 
-   /* After syncing, there is no writer left, so query->value is ready */
-   assert(query->writer == NULL && "cleared when cleaning up batch");
+   uint64_t *ptr = query->ptr.cpu;
+   uint64_t value = *ptr;
 
    switch (query->type) {
    case PIPE_QUERY_OCCLUSION_PREDICATE:
    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-      vresult->b = query->value;
+      vresult->b = value;
       return true;
 
    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-      vresult->b = query->value > 0;
+      vresult->b = value > 0;
       return true;
 
    case PIPE_QUERY_OCCLUSION_COUNTER:
    case PIPE_QUERY_PRIMITIVES_GENERATED:
    case PIPE_QUERY_PRIMITIVES_EMITTED:
    case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
-      vresult->u64 = query->value;
+      vresult->u64 = value;
       return true;
 
    case PIPE_QUERY_TIMESTAMP:
-      vresult->u64 = agx_gpu_time_to_ns(dev, query->timestamp_end);
+      vresult->u64 = agx_gpu_time_to_ns(dev, value);
       return true;
 
    case PIPE_QUERY_TIME_ELAPSED:
-      vresult->u64 =
-         agx_gpu_time_to_ns(dev, query->timestamp_end - query->timestamp_begin);
+      /* end - begin */
+      vresult->u64 = agx_gpu_time_to_ns(dev, ptr[0] - ptr[1]);
       return true;
 
    default:
@@ -272,26 +389,25 @@ agx_get_query_result_resource(struct pipe_context *pipe, struct pipe_query *q,
    /* TODO: Don't cheat XXX */
    struct agx_context *ctx = agx_context(pipe);
-   agx_sync_all(ctx, "Stubbed QBOs");
 
    union pipe_query_result result;
 
    if (index < 0) {
       /* availability */
-      result.u64 = 1;
+      result.u64 = !is_query_busy(ctx, query);
    } else {
       bool ready = agx_get_query_result(pipe, q, true, &result);
       assert(ready);
-   }
 
-   switch (query->type) {
-   case PIPE_QUERY_OCCLUSION_PREDICATE:
-   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-      result.u64 = result.b;
-      break;
-   default:
-      break;
+      switch (query->type) {
+      case PIPE_QUERY_OCCLUSION_PREDICATE:
+      case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+         result.u64 = result.b;
+         break;
+      default:
+         break;
+      }
    }
 
    /* Clamp to type, arb_query_buffer_object-qbo tests */
@@ -318,58 +434,36 @@ agx_set_active_query_state(struct pipe_context *pipe, bool enable)
    ctx->dirty |= AGX_DIRTY_QUERY;
 }
 
-static uint16_t
-agx_add_query_to_batch(struct agx_batch *batch, struct agx_query *query,
-                       struct util_dynarray *array)
+static void
+agx_add_query_to_batch(struct agx_batch *batch, struct agx_query *query)
 {
-   /* If written by another batch, flush it now. If this affects real apps, we
-    * could avoid this flush by merging query results.
-    */
-   if (query->writer && query->writer != batch) {
-      agx_sync_batch_for_reason(batch->ctx, query->writer,
-                                "Multiple query writers");
+   unsigned idx = agx_batch_idx(batch);
+   struct agx_bo *bo = is_occlusion(query) ? batch->ctx->oq->bo : query->bo;
+
+   agx_batch_add_bo(batch, bo);
+   query->writer_generation[idx] = batch->ctx->batches.generation[idx];
+}
+
+void
+agx_batch_add_timestamp_query(struct agx_batch *batch, struct agx_query *q)
+{
+   if (q) {
+      agx_add_query_to_batch(batch, q);
+      util_dynarray_append(&batch->timestamps, struct agx_ptr, q->ptr);
    }
-
-   /* Allocate if needed */
-   if (query->writer == NULL) {
-      query->writer = batch;
-      query->writer_index =
-         util_dynarray_num_elements(array, struct agx_query *);
-
-      util_dynarray_append(array, struct agx_query *, query);
-   }
-
-   assert(query->writer == batch);
-   assert(*util_dynarray_element(array, struct agx_query *,
-                                 query->writer_index) == query);
-
-   return query->writer_index;
 }
 
 uint16_t
 agx_get_oq_index(struct agx_batch *batch, struct agx_query *query)
 {
-   assert(is_occlusion(query));
-
-   return agx_add_query_to_batch(batch, query, &batch->occlusion_queries);
+   agx_add_query_to_batch(batch, query);
+   return agx_oq_index(batch->ctx, query);
 }
 
 uint64_t
 agx_get_query_address(struct agx_batch *batch, struct agx_query *query)
 {
-   assert(!is_occlusion(query));
-
-   agx_add_query_to_batch(batch, query, &batch->nonocclusion_queries);
-
-   /* Allocate storage for the query in the batch */
-   if (!query->ptr.cpu) {
-      query->ptr = agx_pool_alloc_aligned(&batch->pool, sizeof(uint64_t),
-                                          sizeof(uint64_t));
-
-      uint64_t *value = query->ptr.cpu;
-      *value = 0;
-   }
-
+   agx_add_query_to_batch(batch, query);
    return query->ptr.gpu;
 }
 
@@ -377,61 +471,31 @@ void
 agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
                          uint64_t end_ts)
 {
-   uint64_t *occlusion = (uint64_t *)batch->occlusion_buffer.cpu;
+   /* Remove the batch as a writer from all queries by incrementing the
+    * generation of the batch.
+    */
+   batch->ctx->batches.generation[agx_batch_idx(batch)]++;
 
-   util_dynarray_foreach(&batch->occlusion_queries, struct agx_query *, it) {
-      struct agx_query *query = *it;
+   /* Write out timestamps */
+   util_dynarray_foreach(&batch->timestamps, struct agx_ptr, it) {
+      uint64_t *ptr = it->cpu;
 
-      /* Skip queries that have since been destroyed */
-      if (query == NULL)
-         continue;
-
-      assert(query->writer == batch);
-
-      /* Get the result for this batch. If occlusion is NULL, it means that no
-       * draws actually enabled any occlusion queries, so there's no change.
-       */
-      if (occlusion != NULL) {
-         uint64_t result = *(occlusion++);
-
-         /* Accumulate with the previous result (e.g. in case we split a frame
-          * into multiple batches so an API-level query spans multiple batches).
-          */
-         if (query->type == PIPE_QUERY_OCCLUSION_COUNTER)
-            query->value += result;
-         else
-            query->value |= (!!result);
-      }
-
-      query->writer = NULL;
-      query->writer_index = 0;
+      ptr[0] = MAX2(ptr[0], end_ts);
+      ptr[1] = MIN2(ptr[1], begin_ts);
    }
+}
 
-   /* Now handle non-occlusion queries in a similar way */
-   util_dynarray_foreach(&batch->nonocclusion_queries, struct agx_query *, it) {
-      struct agx_query *query = *it;
-      if (query == NULL)
-         continue;
+void
+agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
+                        uint64_t increment)
+{
+   if (!query)
+      return;
 
-      assert(query->writer == batch);
+   sync_query_writers(ctx, query, "CPU query increment");
 
-      /* Accumulate */
-      uint64_t *value = query->ptr.cpu;
-      query->value += (*value);
-      query->writer = NULL;
-      query->writer_index = 0;
-      query->ptr.cpu = NULL;
-      query->ptr.gpu = 0;
-   }
-
-   util_dynarray_foreach(&batch->timestamp_queries, struct agx_query *, it) {
-      struct agx_query *query = *it;
-      if (query == NULL)
-         continue;
-
-      query->timestamp_begin = MIN2(query->timestamp_begin, begin_ts);
-      query->timestamp_end = MAX2(query->timestamp_end, end_ts);
-   }
+   uint64_t *value = query->ptr.cpu;
+   *value += increment;
 }
 
 static void
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index 7a062901daf..2b797562850 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -3974,13 +3974,11 @@ agx_ia_update_direct(struct agx_context *ctx, const struct pipe_draw_info *info,
 
    count *= info->instance_count;
 
-   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]) {
-      ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]->value += count;
-   }
+   agx_query_increment_cpu(
+      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES], count);
 
-   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]) {
-      ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]->value += count;
-   }
+   agx_query_increment_cpu(
+      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS], count);
 }
 
 static uint64_t
@@ -4676,10 +4674,9 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
       return;
 
    /* TCS invocation counter increments once per-patch */
-   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]) {
-      ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]->value +=
-         in_patches;
-   }
+   agx_query_increment_cpu(
+      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS],
+      in_patches);
 
    struct agx_batch *batch = agx_get_compute_batch(ctx);
    agx_batch_init_state(batch);
@@ -4830,10 +4827,9 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
       desc[4] = 0; /* start_instance */
 
      /* TES invocation counter increments once per tessellated vertex */
-      if (ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]) {
-         ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]->value +=
-            data.num_domain_points;
-      }
+      agx_query_increment_cpu(
+         ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS],
+         data.num_domain_points);
    }
 
    p_tess_destroy(tess);
@@ -5419,7 +5415,8 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
       unsigned blocksize = info->block[0] * info->block[1] * info->block[2];
       unsigned count = workgroups * blocksize;
 
-      ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS]->value += count;
+      agx_query_increment_cpu(
+         ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS], count);
    }
 
    struct agx_batch *batch = agx_get_compute_batch(ctx);
diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h
index 157d02565f5..2eb0e0d88f0 100644
--- a/src/gallium/drivers/asahi/agx_state.h
+++ b/src/gallium/drivers/asahi/agx_state.h
@@ -372,15 +372,8 @@ struct agx_batch {
 
    /* Scissor and depth-bias descriptors, uploaded at GPU time */
    struct util_dynarray scissor, depth_bias;
 
-   /* Indexed occlusion queries within the occlusion buffer, and the occlusion
-    * buffer itself which is allocated at submit time.
-    */
-   struct util_dynarray occlusion_queries;
-   struct agx_ptr occlusion_buffer;
-
-   /* Non-occlusion queries */
-   struct util_dynarray nonocclusion_queries;
-   struct util_dynarray timestamp_queries;
+   /* Array of GPU pointers that should be written with the batch timestamps */
+   struct util_dynarray timestamps;
 
    /* Result buffer where the kernel places command execution information */
    union agx_batch_result *result;
@@ -562,6 +555,8 @@ struct asahi_blitter {
    void *saved_cs;
 };
 
+struct agx_oq_heap;
+
 struct agx_context {
    struct pipe_context base;
    struct agx_compiled_shader *vs, *fs, *gs, *tcs, *tes;
@@ -570,6 +565,9 @@ struct agx_context {
    /* Heap for dynamic memory allocation for geometry/tessellation shaders */
    struct pipe_resource *heap;
 
+   /* Occlusion query heap */
+   struct agx_oq_heap *oq;
+
    /* Acts as a context-level shader key */
    bool support_lod_bias;
    bool robust;
@@ -586,6 +584,12 @@ struct agx_context {
 
      /** Set of submitted batches for faster traversal */
      BITSET_DECLARE(submitted, AGX_MAX_BATCHES);
+
+      /* Monotonic counter for each batch, incremented when resetting the batch
+       * to invalidate all associated queries. Compared to
+       * agx_query::writer_generation.
+       */
+      uint64_t generation[AGX_MAX_BATCHES];
   } batches;
 
   struct agx_batch *batch;
@@ -656,6 +660,12 @@ struct agx_context {
    struct agx_scratch scratch_cs;
 };
 
+static inline unsigned
+agx_batch_idx(struct agx_batch *batch)
+{
+   return batch - batch->ctx->batches.slots;
+}
+
 static void
 agx_writer_add(struct agx_context *ctx, uint8_t batch_index, unsigned handle)
 {
@@ -759,30 +769,9 @@ struct agx_query {
    unsigned type;
    unsigned index;
 
-   /* Invariant for occlusion queries:
-    *
-    * writer != NULL => writer->occlusion_queries[writer_index] == this, and
-    * writer == NULL => no batch such that this in batch->occlusion_queries
-    */
-   struct agx_batch *writer;
-   unsigned writer_index;
-
-   /* For GPU queries other than occlusion queries, the value of the query as
-    * written by the `writer` if a writer is non-NULL, and irrelevant otherwise.
-    * When flushing the query, this value is read and added to agx_query::value.
-    */
+   uint64_t writer_generation[AGX_MAX_BATCHES];
+   struct agx_bo *bo;
    struct agx_ptr ptr;
-
-   /* Accumulator flushed to the CPU */
-   union {
-      uint64_t value;
-      uint64_t timestamp_end;
-   };
-
-   /* For time elapsed queries, end is in the above union for consistent
-    * handling witn timestamp queries.
-    */
-   uint64_t timestamp_begin;
 };
 
 struct agx_sampler_state {
@@ -1046,6 +1035,9 @@ void agx_batch_add_timestamp_query(struct agx_batch *batch,
                                    struct agx_query *q);
 void agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q);
 
+void agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
+                             uint64_t increment);
+
 /* Blit shaders */
 void agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
                       bool render_cond);
@@ -1072,6 +1064,7 @@ uint64_t agx_build_meta(struct agx_batch *batch, bool store,
 uint16_t agx_get_oq_index(struct agx_batch *batch, struct agx_query *query);
 uint64_t agx_get_query_address(struct agx_batch *batch,
                                struct agx_query *query);
+uint64_t agx_get_occlusion_heap(struct agx_batch *batch);
 void agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
                               uint64_t end_ts);
 
diff --git a/src/gallium/drivers/asahi/agx_streamout.c b/src/gallium/drivers/asahi/agx_streamout.c
index 1b03aea388f..3389415badf 100644
--- a/src/gallium/drivers/asahi/agx_streamout.c
+++ b/src/gallium/drivers/asahi/agx_streamout.c
@@ -171,8 +171,8 @@ agx_primitives_update_direct(struct agx_context *ctx,
    assert(!ctx->stage[PIPE_SHADER_GEOMETRY].shader &&
           "Geometry shaders use their own counting");
 
-   ctx->prims_generated[0]->value +=
-      xfb_prims_for_vertices(info->mode, draw->count);
+   agx_query_increment_cpu(ctx, ctx->prims_generated[0],
+                           xfb_prims_for_vertices(info->mode, draw->count));
 }
 
 void