diff --git a/src/gallium/drivers/asahi/agx_batch.c b/src/gallium/drivers/asahi/agx_batch.c
index 557b3f15890..3d75cdc58ff 100644
--- a/src/gallium/drivers/asahi/agx_batch.c
+++ b/src/gallium/drivers/asahi/agx_batch.c
@@ -24,12 +24,6 @@
          agx_msg("[Batch %u] " fmt "\n", agx_batch_idx(batch), ##__VA_ARGS__); \
    } while (0)
 
-static unsigned
-agx_batch_idx(struct agx_batch *batch)
-{
-   return batch - batch->ctx->batches.slots;
-}
-
 bool
 agx_batch_is_active(struct agx_batch *batch)
 {
@@ -125,9 +119,7 @@ agx_batch_init(struct agx_context *ctx,
 
    util_dynarray_init(&batch->scissor, ctx);
    util_dynarray_init(&batch->depth_bias, ctx);
-   util_dynarray_init(&batch->occlusion_queries, ctx);
-   util_dynarray_init(&batch->nonocclusion_queries, ctx);
-   util_dynarray_init(&batch->timestamp_queries, ctx);
+   util_dynarray_init(&batch->timestamps, ctx);
 
    batch->clear = 0;
    batch->draw = 0;
@@ -177,8 +169,6 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
 
    uint64_t begin_ts = ~0, end_ts = 0; /* TODO: UAPI pending */
    agx_finish_batch_queries(batch, begin_ts, end_ts);
-   batch->occlusion_buffer.cpu = NULL;
-   batch->occlusion_buffer.gpu = 0;
 
    if (reset) {
      int handle;
@@ -212,9 +202,7 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
 
    util_dynarray_fini(&batch->scissor);
    util_dynarray_fini(&batch->depth_bias);
-   util_dynarray_fini(&batch->occlusion_queries);
-   util_dynarray_fini(&batch->nonocclusion_queries);
-   util_dynarray_fini(&batch->timestamp_queries);
+   util_dynarray_fini(&batch->timestamps);
 
    if (!(dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC))) {
       agx_batch_print_stats(dev, batch);
@@ -774,13 +762,6 @@ agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch)
    agx_batch_cleanup(ctx, batch, true);
 }
 
-void
-agx_batch_add_timestamp_query(struct agx_batch *batch, struct agx_query *q)
-{
-   if (q)
-      util_dynarray_append(&batch->timestamp_queries, struct agx_query *, q);
-}
-
 /*
  * Timestamp queries record the time after all current work is finished,
  * which we handle as the time after all current batches finish (since we're a
diff --git a/src/gallium/drivers/asahi/agx_pipe.c b/src/gallium/drivers/asahi/agx_pipe.c
index 58a05eee432..5889c50a71d 100644
--- a/src/gallium/drivers/asahi/agx_pipe.c
+++ b/src/gallium/drivers/asahi/agx_pipe.c
@@ -1333,19 +1333,6 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
     */
    agx_batch_add_bo(batch, batch->vdm.bo);
 
-   /* Occlusion queries are allocated as a contiguous pool */
-   unsigned oq_count =
-      util_dynarray_num_elements(&batch->occlusion_queries, struct agx_query *);
-   size_t oq_size = oq_count * sizeof(uint64_t);
-
-   if (oq_size) {
-      batch->occlusion_buffer =
-         agx_pool_alloc_aligned(&batch->pool, oq_size, 64);
-      memset(batch->occlusion_buffer.cpu, 0, oq_size);
-   } else {
-      batch->occlusion_buffer.gpu = 0;
-   }
-
    if (batch->vs_scratch)
       agx_batch_add_bo(batch, ctx->scratch_vs.buf);
    if (batch->fs_scratch)
diff --git a/src/gallium/drivers/asahi/agx_query.c b/src/gallium/drivers/asahi/agx_query.c
index 193fb92f06f..9a04ae3267b 100644
--- a/src/gallium/drivers/asahi/agx_query.c
+++ b/src/gallium/drivers/asahi/agx_query.c
@@ -6,24 +6,16 @@
 
 #include
 #include "pipe/p_defines.h"
+#include "util/bitset.h"
 #include "util/macros.h"
+#include "util/ralloc.h"
 #include "util/u_inlines.h"
 #include "util/u_prim.h"
+#include "agx_bo.h"
 #include "agx_device.h"
 #include "agx_state.h"
 #include "pool.h"
 
-static struct pipe_query *
-agx_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
-{
-   struct agx_query *query = calloc(1, sizeof(struct agx_query));
-
-   query->type = query_type;
-   query->index = index;
-
-   return (struct pipe_query *)query;
-}
-
 static bool
 is_occlusion(struct agx_query *query)
 {
@@ -49,40 +41,184 @@ is_timer(struct agx_query *query)
    }
 }
 
+#define AGX_MAX_OCCLUSION_QUERIES (65536)
+
+struct agx_oq_heap {
+   /* The GPU allocation itself */
+   struct agx_bo *bo;
+
+   /* Bitset of query indices that are available */
+   BITSET_DECLARE(available, AGX_MAX_OCCLUSION_QUERIES);
+};
+
+static void
+agx_destroy_oq_heap(void *heap_)
+{
+   struct agx_oq_heap *heap = heap_;
+   agx_bo_unreference(heap->bo);
+}
+
+static struct agx_oq_heap *
+agx_alloc_oq_heap(struct agx_context *ctx)
+{
+   struct agx_oq_heap *heap = rzalloc(ctx, struct agx_oq_heap);
+   ralloc_set_destructor(heap, agx_destroy_oq_heap);
+
+   heap->bo = agx_bo_create(agx_device(ctx->base.screen),
+                            AGX_MAX_OCCLUSION_QUERIES * sizeof(uint64_t),
+                            AGX_BO_WRITEBACK, "Occlusion query heap");
+
+   /* At the start, everything is available */
+   BITSET_ONES(heap->available);
+
+   return heap;
+}
+
+static struct agx_oq_heap *
+agx_get_oq_heap(struct agx_context *ctx)
+{
+   if (!ctx->oq)
+      ctx->oq = agx_alloc_oq_heap(ctx);
+
+   return ctx->oq;
+}
+
+static struct agx_ptr
+agx_alloc_oq(struct agx_context *ctx)
+{
+   struct agx_oq_heap *heap = agx_get_oq_heap(ctx);
+
+   /* Find first available */
+   int ffs = BITSET_FFS(heap->available);
+   if (!ffs)
+      return (struct agx_ptr){NULL, 0};
+
+   /* Allocate it */
+   unsigned index = ffs - 1;
+   BITSET_CLEAR(heap->available, index);
+
+   unsigned offset = index * sizeof(uint64_t);
+
+   return (struct agx_ptr){
+      (uint8_t *)heap->bo->ptr.cpu + offset,
+      heap->bo->ptr.gpu + offset,
+   };
+}
+
+static unsigned
+agx_oq_index(struct agx_context *ctx, struct agx_query *q)
+{
+   assert(is_occlusion(q));
+
+   return (q->ptr.gpu - ctx->oq->bo->ptr.gpu) / sizeof(uint64_t);
+}
+
+static void
+agx_free_oq(struct agx_context *ctx, struct agx_query *q)
+{
+   struct agx_oq_heap *heap = agx_get_oq_heap(ctx);
+   unsigned index = agx_oq_index(ctx, q);
+
+   assert(index < AGX_MAX_OCCLUSION_QUERIES);
+   assert(!BITSET_TEST(heap->available, index));
+
+   BITSET_SET(heap->available, index);
+}
+
+uint64_t
+agx_get_occlusion_heap(struct agx_batch *batch)
+{
+   if (!batch->ctx->oq)
+      return 0;
+
+   struct agx_bo *bo = batch->ctx->oq->bo;
+
+   if (agx_batch_uses_bo(batch, bo))
+      return bo->ptr.gpu;
+   else
+      return 0;
+}
+
+static struct pipe_query *
+agx_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
+{
+   struct agx_query *query = calloc(1, sizeof(struct agx_query));
+
+   query->type = query_type;
+   query->index = index;
+
+   /* Set all writer generations to a sentinel that will always compare as
+    * false, since no batch has written this freshly created query.
+    */
+   for (unsigned i = 0; i < ARRAY_SIZE(query->writer_generation); ++i) {
+      query->writer_generation[i] = UINT64_MAX;
+   }
+
+   if (is_occlusion(query)) {
+      query->ptr = agx_alloc_oq(agx_context(ctx));
+   } else {
+      /* TODO: a BO for the query is wasteful, but we benefit from BO list
+       * tracking / reference counting to deal with lifetimes.
+       */
+      query->bo = agx_bo_create(agx_device(ctx->screen), sizeof(uint64_t) * 2,
+                                AGX_BO_WRITEBACK, "Query");
+      query->ptr = query->bo->ptr;
+   }
+
+   if (!query->ptr.gpu) {
+      free(query);
+      return NULL;
+   }
+
+   return (struct pipe_query *)query;
+}
+
+static void
+sync_query_writers(struct agx_context *ctx, struct agx_query *query,
+                   const char *reason)
+{
+   STATIC_ASSERT(ARRAY_SIZE(ctx->batches.generation) == AGX_MAX_BATCHES);
+   STATIC_ASSERT(ARRAY_SIZE(ctx->batches.slots) == AGX_MAX_BATCHES);
+   STATIC_ASSERT(ARRAY_SIZE(query->writer_generation) == AGX_MAX_BATCHES);
+
+   for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
+      if (query->writer_generation[i] == ctx->batches.generation[i])
+         agx_sync_batch_for_reason(ctx, &ctx->batches.slots[i], reason);
+   }
+}
+
+static bool
+is_query_busy(struct agx_context *ctx, struct agx_query *query)
+{
+   for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
+      if (query->writer_generation[i] == ctx->batches.generation[i])
+         return true;
+   }
+
+   return false;
+}
+
 static void
 agx_destroy_query(struct pipe_context *pctx, struct pipe_query *pquery)
 {
    struct agx_context *ctx = agx_context(pctx);
    struct agx_query *query = (struct agx_query *)pquery;
 
-   /* It is legal for the query to be destroyed before its value is read,
-    * particularly during application teardown. In this case, don't leave a
-    * dangling reference to the query.
+   /* We don't reference count the occlusion query allocations, so we need to
+    * sync writers when destroying so we can freely write from the CPU after
+    * it's destroyed, since the driver will assume an available query is idle.
+    *
+    * For other queries, the BO itself is reference counted, so it stays valid
+    * after the pipe_query is destroyed and we don't need to flush.
     */
-   if (query->writer) {
-      assert(!is_timer(query) && "single writer not used here");
-
-      struct agx_batch *writer = query->writer;
-      struct util_dynarray *array = is_occlusion(query)
-                                       ? &writer->occlusion_queries
-                                       : &writer->nonocclusion_queries;
-      struct agx_query **ptr =
-         util_dynarray_element(array, struct agx_query *, query->writer_index);
-
-      assert((*ptr) == query && "data structure invariant");
-      *ptr = NULL;
-   } else if (is_timer(query)) {
-      /* Potentially has many writers! We need them all to synchronize so they
-       * don't have dangling references. Syncing will destroy batches that hold
-       * references as required.
-       *
-       * TODO: Optimize this, timestamp queries are bonkers on tilers.
-       */
-      agx_flush_all(ctx, "Destroying time query");
-      agx_sync_all(ctx, "Destroying time query");
+   if (is_occlusion(query)) {
+      sync_query_writers(ctx, query, "Occlusion query destroy");
+      agx_free_oq(ctx, query);
+   } else {
+      agx_bo_unreference(query->bo);
    }
 
-   free(query);
+   free(pquery);
 }
 
 static bool
@@ -118,9 +254,7 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)
 
    case PIPE_QUERY_TIME_ELAPSED:
       ctx->time_elapsed = query;
-      query->timestamp_begin = UINT64_MAX;
-      query->timestamp_end = 0;
-      return true;
+      break;
 
    case PIPE_QUERY_TIMESTAMP:
       /* No-op */
@@ -135,17 +269,17 @@
       return false;
    }
 
-   /* begin_query zeroes, flush so we can do that write. If anything (i.e.
-    * other than piglit) actually hits this, we could shadow the query to
-    * avoid the flush.
-    */
-   if (query->writer) {
-      agx_flush_batch_for_reason(ctx, query->writer, "Query overwritten");
-      agx_sync_batch_for_reason(ctx, query->writer, "Query overwrriten");
+   /* begin_query zeroes, sync so we can do that write from the CPU */
+   sync_query_writers(ctx, query, "Query overwritten");
+
+   uint64_t *ptr = query->ptr.cpu;
+   ptr[0] = 0;
+
+   if (query->type == PIPE_QUERY_TIME_ELAPSED) {
+      /* Timestamp begin is in the second slot, the end in the first */
+      ptr[1] = UINT64_MAX;
    }
 
-   assert(query->writer == NULL);
-   query->value = 0;
    return true;
 }
 
@@ -183,15 +317,17 @@ agx_end_query(struct pipe_context *pctx, struct pipe_query *pquery)
       assert(query->index < ARRAY_SIZE(ctx->pipeline_statistics));
       ctx->pipeline_statistics[query->index] = NULL;
       return true;
-   case PIPE_QUERY_TIMESTAMP:
+   case PIPE_QUERY_TIMESTAMP: {
      /* Timestamp logically written now, set up batches to MAX their finish
       * time in. If there are no batches, it's just the current time stamp.
       */
      agx_add_timestamp_end_query(ctx, query);
 
-      query->timestamp_end = agx_get_gpu_timestamp(dev);
+      uint64_t *value = query->ptr.cpu;
+      *value = agx_get_gpu_timestamp(dev);
       return true;
+   }
 
    default:
       return false;
    }
@@ -205,56 +341,37 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery,
    struct agx_context *ctx = agx_context(pctx);
    struct agx_device *dev = agx_device(pctx->screen);
 
-   /* For GPU queries, flush the writer. When the writer is flushed, the GPU
-    * will write the value, and when we wait for the writer, the CPU will read
-    * the value into query->value.
-    */
-   if (query->writer != NULL) {
-      /* Querying the result forces a query to finish in finite time, so we
-       * need to flush. Furthermore, we need all earlier queries
-       * to finish before this query, so we sync unconditionally (so we can
-       * maintain the lie that all queries are finished when read).
-       *
-       * TODO: Optimize based on wait flag.
-       */
-      struct agx_batch *writer = query->writer;
-      agx_flush_batch_for_reason(ctx, writer, "GPU query");
-      agx_sync_batch_for_reason(ctx, writer, "GPU query");
-   } else if (query->type == PIPE_QUERY_TIMESTAMP ||
-              query->type == PIPE_QUERY_TIME_ELAPSED) {
-      /* TODO: Optimize this... timestamp queries are bonkers on tilers. */
-      agx_flush_all(ctx, "Timestamp query");
-      agx_sync_all(ctx, "Timestamp query");
-   }
+   /* TODO: Honour `wait` */
+   sync_query_writers(ctx, query, "Reading query results");
 
-   /* After syncing, there is no writer left, so query->value is ready */
-   assert(query->writer == NULL && "cleared when cleaning up batch");
+   uint64_t *ptr = query->ptr.cpu;
+   uint64_t value = *ptr;
 
    switch (query->type) {
    case PIPE_QUERY_OCCLUSION_PREDICATE:
    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-      vresult->b = query->value;
+      vresult->b = value;
       return true;
 
    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-      vresult->b = query->value > 0;
+      vresult->b = value > 0;
       return true;
 
    case PIPE_QUERY_OCCLUSION_COUNTER:
    case PIPE_QUERY_PRIMITIVES_GENERATED:
    case PIPE_QUERY_PRIMITIVES_EMITTED:
    case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
-      vresult->u64 = query->value;
+      vresult->u64 = value;
       return true;
 
    case PIPE_QUERY_TIMESTAMP:
-      vresult->u64 = agx_gpu_time_to_ns(dev, query->timestamp_end);
+      vresult->u64 = agx_gpu_time_to_ns(dev, value);
       return true;
 
    case PIPE_QUERY_TIME_ELAPSED:
-      vresult->u64 =
-         agx_gpu_time_to_ns(dev, query->timestamp_end - query->timestamp_begin);
+      /* end - begin */
+      vresult->u64 = agx_gpu_time_to_ns(dev, ptr[0] - ptr[1]);
       return true;
 
    default:
@@ -272,26 +389,25 @@ agx_get_query_result_resource(struct pipe_context *pipe, struct pipe_query *q,
    /* TODO: Don't cheat XXX */
    struct agx_context *ctx = agx_context(pipe);
-   agx_sync_all(ctx, "Stubbed QBOs");
 
    union pipe_query_result result;
 
    if (index < 0) {
       /* availability */
-      result.u64 = 1;
+      result.u64 = !is_query_busy(ctx, query);
    } else {
       bool ready = agx_get_query_result(pipe, q, true, &result);
       assert(ready);
-   }
 
-   switch (query->type) {
-   case PIPE_QUERY_OCCLUSION_PREDICATE:
-   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
-   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-      result.u64 = result.b;
-      break;
-   default:
-      break;
+      switch (query->type) {
+      case PIPE_QUERY_OCCLUSION_PREDICATE:
+      case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+         result.u64 = result.b;
+         break;
+      default:
+         break;
+      }
    }
 
    /* Clamp to type, arb_query_buffer_object-qbo tests */
@@ -318,58 +434,36 @@ agx_set_active_query_state(struct pipe_context *pipe, bool enable)
    ctx->dirty |= AGX_DIRTY_QUERY;
 }
 
-static uint16_t
-agx_add_query_to_batch(struct agx_batch *batch, struct agx_query *query,
-                       struct util_dynarray *array)
+static void
+agx_add_query_to_batch(struct agx_batch *batch, struct agx_query *query)
 {
-   /* If written by another batch, flush it now. If this affects real apps, we
-    * could avoid this flush by merging query results.
-    */
-   if (query->writer && query->writer != batch) {
-      agx_sync_batch_for_reason(batch->ctx, query->writer,
-                                "Multiple query writers");
+   unsigned idx = agx_batch_idx(batch);
+   struct agx_bo *bo = is_occlusion(query) ? batch->ctx->oq->bo : query->bo;
+
+   agx_batch_add_bo(batch, bo);
+   query->writer_generation[idx] = batch->ctx->batches.generation[idx];
+}
+
+void
+agx_batch_add_timestamp_query(struct agx_batch *batch, struct agx_query *q)
+{
+   if (q) {
+      agx_add_query_to_batch(batch, q);
+      util_dynarray_append(&batch->timestamps, struct agx_ptr, q->ptr);
    }
-
-   /* Allocate if needed */
-   if (query->writer == NULL) {
-      query->writer = batch;
-      query->writer_index =
-         util_dynarray_num_elements(array, struct agx_query *);
-
-      util_dynarray_append(array, struct agx_query *, query);
-   }
-
-   assert(query->writer == batch);
-   assert(*util_dynarray_element(array, struct agx_query *,
-                                 query->writer_index) == query);
-
-   return query->writer_index;
 }
 
 uint16_t
 agx_get_oq_index(struct agx_batch *batch, struct agx_query *query)
 {
-   assert(is_occlusion(query));
-
-   return agx_add_query_to_batch(batch, query, &batch->occlusion_queries);
+   agx_add_query_to_batch(batch, query);
+   return agx_oq_index(batch->ctx, query);
 }
 
 uint64_t
 agx_get_query_address(struct agx_batch *batch, struct agx_query *query)
 {
-   assert(!is_occlusion(query));
-
-   agx_add_query_to_batch(batch, query, &batch->nonocclusion_queries);
-
-   /* Allocate storage for the query in the batch */
-   if (!query->ptr.cpu) {
-      query->ptr = agx_pool_alloc_aligned(&batch->pool, sizeof(uint64_t),
-                                          sizeof(uint64_t));
-
-      uint64_t *value = query->ptr.cpu;
-      *value = 0;
-   }
-
+   agx_add_query_to_batch(batch, query);
    return query->ptr.gpu;
 }
 
@@ -377,61 +471,31 @@ void
 agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
                          uint64_t end_ts)
 {
-   uint64_t *occlusion = (uint64_t *)batch->occlusion_buffer.cpu;
+   /* Remove the batch as a writer from all queries by incrementing the
+    * generation of the batch.
+    */
+   batch->ctx->batches.generation[agx_batch_idx(batch)]++;
 
-   util_dynarray_foreach(&batch->occlusion_queries, struct agx_query *, it) {
-      struct agx_query *query = *it;
+   /* Write out timestamps */
+   util_dynarray_foreach(&batch->timestamps, struct agx_ptr, it) {
+      uint64_t *ptr = it->cpu;
 
-      /* Skip queries that have since been destroyed */
-      if (query == NULL)
-         continue;
-
-      assert(query->writer == batch);
-
-      /* Get the result for this batch. If occlusion is NULL, it means that no
-       * draws actually enabled any occlusion queries, so there's no change.
-       */
-      if (occlusion != NULL) {
-         uint64_t result = *(occlusion++);
-
-         /* Accumulate with the previous result (e.g. in case we split a frame
-          * into multiple batches so an API-level query spans multiple batches).
-          */
-         if (query->type == PIPE_QUERY_OCCLUSION_COUNTER)
-            query->value += result;
-         else
-            query->value |= (!!result);
-      }
-
-      query->writer = NULL;
-      query->writer_index = 0;
+      ptr[0] = MAX2(ptr[0], end_ts);
+      ptr[1] = MIN2(ptr[1], begin_ts);
    }
+}
 
-   /* Now handle non-occlusion queries in a similar way */
-   util_dynarray_foreach(&batch->nonocclusion_queries, struct agx_query *, it) {
-      struct agx_query *query = *it;
-      if (query == NULL)
-         continue;
+void
+agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
+                        uint64_t increment)
+{
+   if (!query)
+      return;
 
-      assert(query->writer == batch);
+   sync_query_writers(ctx, query, "CPU query increment");
 
-      /* Accumulate */
-      uint64_t *value = query->ptr.cpu;
-      query->value += (*value);
-      query->writer = NULL;
-      query->writer_index = 0;
-      query->ptr.cpu = NULL;
-      query->ptr.gpu = 0;
-   }
-
-   util_dynarray_foreach(&batch->timestamp_queries, struct agx_query *, it) {
-      struct agx_query *query = *it;
-      if (query == NULL)
-         continue;
-
-      query->timestamp_begin = MIN2(query->timestamp_begin, begin_ts);
-      query->timestamp_end = MAX2(query->timestamp_end, end_ts);
-   }
+   uint64_t *value = query->ptr.cpu;
+   *value += increment;
 }
 
 static void
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index 7a062901daf..2b797562850 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -3974,13 +3974,11 @@ agx_ia_update_direct(struct agx_context *ctx, const struct pipe_draw_info *info,
 
    count *= info->instance_count;
 
-   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]) {
-      ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]->value += count;
-   }
+   agx_query_increment_cpu(
+      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES], count);
 
-   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]) {
-      ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]->value += count;
-   }
+   agx_query_increment_cpu(
+      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS], count);
 }
 
 static uint64_t
@@ -4676,10 +4674,9 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
       return;
 
    /* TCS invocation counter increments once per-patch */
-   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]) {
-      ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]->value +=
-         in_patches;
-   }
+   agx_query_increment_cpu(
+      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS],
+      in_patches);
 
    struct agx_batch *batch = agx_get_compute_batch(ctx);
    agx_batch_init_state(batch);
@@ -4830,10 +4827,9 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
       desc[4] = 0; /* start_instance */
 
      /* TES invocation counter increments once per tessellated vertex */
-      if (ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]) {
-         ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]->value +=
-            data.num_domain_points;
-      }
+      agx_query_increment_cpu(
+         ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS],
+         data.num_domain_points);
    }
 
    p_tess_destroy(tess);
@@ -5419,7 +5415,8 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
       unsigned blocksize = info->block[0] * info->block[1] * info->block[2];
       unsigned count = workgroups * blocksize;
 
-      ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS]->value += count;
+      agx_query_increment_cpu(
+         ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS], count);
    }
 
    struct agx_batch *batch = agx_get_compute_batch(ctx);
diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h
index 157d02565f5..2eb0e0d88f0 100644
--- a/src/gallium/drivers/asahi/agx_state.h
+++ b/src/gallium/drivers/asahi/agx_state.h
@@ -372,15 +372,8 @@ struct agx_batch {
 
    /* Scissor and depth-bias descriptors, uploaded at GPU time */
    struct util_dynarray scissor, depth_bias;
 
-   /* Indexed occlusion queries within the occlusion buffer, and the occlusion
-    * buffer itself which is allocated at submit time.
-    */
-   struct util_dynarray occlusion_queries;
-   struct agx_ptr occlusion_buffer;
-
-   /* Non-occlusion queries */
-   struct util_dynarray nonocclusion_queries;
-   struct util_dynarray timestamp_queries;
+   /* Array of GPU pointers that should be written with the batch timestamps */
+   struct util_dynarray timestamps;
 
    /* Result buffer where the kernel places command execution information */
    union agx_batch_result *result;
@@ -562,6 +555,8 @@ struct asahi_blitter {
    void *saved_cs;
 };
 
+struct agx_oq_heap;
+
 struct agx_context {
    struct pipe_context base;
    struct agx_compiled_shader *vs, *fs, *gs, *tcs, *tes;
@@ -570,6 +565,9 @@ struct agx_context {
    /* Heap for dynamic memory allocation for geometry/tessellation shaders */
    struct pipe_resource *heap;
 
+   /* Occlusion query heap */
+   struct agx_oq_heap *oq;
+
    /* Acts as a context-level shader key */
    bool support_lod_bias;
    bool robust;
@@ -586,6 +584,12 @@ struct agx_context {
 
      /** Set of submitted batches for faster traversal */
      BITSET_DECLARE(submitted, AGX_MAX_BATCHES);
+
+      /* Monotonic counter for each batch, incremented when resetting the batch
+       * to invalidate all associated queries. Compared to
+       * agx_query::writer_generation.
+       */
+      uint64_t generation[AGX_MAX_BATCHES];
   } batches;
 
   struct agx_batch *batch;
@@ -656,6 +660,12 @@ struct agx_context {
    struct agx_scratch scratch_cs;
 };
 
+static inline unsigned
+agx_batch_idx(struct agx_batch *batch)
+{
+   return batch - batch->ctx->batches.slots;
+}
+
 static void
 agx_writer_add(struct agx_context *ctx, uint8_t batch_index, unsigned handle)
 {
@@ -759,30 +769,9 @@ struct agx_query {
    unsigned type;
    unsigned index;
 
-   /* Invariant for occlusion queries:
-    *
-    * writer != NULL => writer->occlusion_queries[writer_index] == this, and
-    * writer == NULL => no batch such that this in batch->occlusion_queries
-    */
-   struct agx_batch *writer;
-   unsigned writer_index;
-
-   /* For GPU queries other than occlusion queries, the value of the query as
-    * written by the `writer` if a writer is non-NULL, and irrelevant otherwise.
-    * When flushing the query, this value is read and added to agx_query::value.
-    */
+   uint64_t writer_generation[AGX_MAX_BATCHES];
+   struct agx_bo *bo;
    struct agx_ptr ptr;
-
-   /* Accumulator flushed to the CPU */
-   union {
-      uint64_t value;
-      uint64_t timestamp_end;
-   };
-
-   /* For time elapsed queries, end is in the above union for consistent
-    * handling witn timestamp queries.
-    */
-   uint64_t timestamp_begin;
 };
 
 struct agx_sampler_state {
@@ -1046,6 +1035,9 @@ void agx_batch_add_timestamp_query(struct agx_batch *batch,
                                    struct agx_query *q);
 void agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q);
 
+void agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
+                             uint64_t increment);
+
 /* Blit shaders */
 void agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
                       bool render_cond);
@@ -1072,6 +1064,7 @@ uint64_t agx_build_meta(struct agx_batch *batch, bool store,
 uint16_t agx_get_oq_index(struct agx_batch *batch, struct agx_query *query);
 uint64_t agx_get_query_address(struct agx_batch *batch,
                                struct agx_query *query);
+uint64_t agx_get_occlusion_heap(struct agx_batch *batch);
 void agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
                               uint64_t end_ts);
 
diff --git a/src/gallium/drivers/asahi/agx_streamout.c b/src/gallium/drivers/asahi/agx_streamout.c
index 1b03aea388f..3389415badf 100644
--- a/src/gallium/drivers/asahi/agx_streamout.c
+++ b/src/gallium/drivers/asahi/agx_streamout.c
@@ -171,8 +171,8 @@ agx_primitives_update_direct(struct agx_context *ctx,
    assert(!ctx->stage[PIPE_SHADER_GEOMETRY].shader &&
           "Geometry shaders use their own counting");
 
-   ctx->prims_generated[0]->value +=
-      xfb_prims_for_vertices(info->mode, draw->count);
+   agx_query_increment_cpu(ctx, ctx->prims_generated[0],
+                           xfb_prims_for_vertices(info->mode, draw->count));
 }
 
 void