asahi: rewrite queries

1. always keep the query in gpu memory, so we can implement qbos properly.

2. use a lightweight data structure for tracking writers to reduce overhead
   (see the sketch after this list)

3. allow many writers per query to eliminate stalls

4. use a context-wide occlusion heap, to satisfy #1 without introducing
   flushes or silly copies. this is what the pvr mesa driver does :-)
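
For orientation, a minimal standalone sketch of the writer-tracking idea in
points 2 and 3, distilled from the diff below; the names (MAX_BATCHES,
record_writer, query_busy, retire_batch) are simplified stand-ins, not the
driver's actual API. Each query stores one generation value per batch slot; a
batch records itself as a writer by copying that slot's current generation;
retiring the batch bumps the generation, which drops the batch from every
query it wrote without walking any per-batch query list.

#include <stdbool.h>
#include <stdint.h>

#define MAX_BATCHES 128 /* stand-in; the driver uses AGX_MAX_BATCHES */

struct ctx {
   /* Bumped whenever the batch occupying a slot finishes or is reset */
   uint64_t batch_generation[MAX_BATCHES];
};

struct query {
   /* Generation observed when the batch in each slot last wrote this query */
   uint64_t writer_generation[MAX_BATCHES];
};

static void
query_init(struct query *q)
{
   /* Sentinel that never matches a live generation: "no writers yet" */
   for (unsigned i = 0; i < MAX_BATCHES; ++i)
      q->writer_generation[i] = UINT64_MAX;
}

static void
record_writer(struct ctx *c, struct query *q, unsigned batch_idx)
{
   /* O(1); any number of batches may write the same query concurrently */
   q->writer_generation[batch_idx] = c->batch_generation[batch_idx];
}

static bool
query_busy(const struct ctx *c, const struct query *q)
{
   /* A slot still "owns" the query only while the generations match */
   for (unsigned i = 0; i < MAX_BATCHES; ++i) {
      if (q->writer_generation[i] == c->batch_generation[i])
         return true;
   }
   return false;
}

static void
retire_batch(struct ctx *c, unsigned batch_idx)
{
   /* One increment forgets this batch in every query it ever wrote */
   c->batch_generation[batch_idx]++;
}

In the patch itself, agx_add_query_to_batch() records the writer,
is_query_busy() and sync_query_writers() scan the slots, and
agx_finish_batch_queries() increments ctx->batches.generation[] for the
finishing batch.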

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
Author:    Alyssa Rosenzweig
Date:      2024-01-21 15:02:10 -04:00
Committer: Marge Bot
Parent:    ca58bc239a
Commit:    23b4503225

6 changed files with 285 additions and 263 deletions


@@ -24,12 +24,6 @@
agx_msg("[Batch %u] " fmt "\n", agx_batch_idx(batch), ##__VA_ARGS__); \
} while (0)
static unsigned
agx_batch_idx(struct agx_batch *batch)
{
return batch - batch->ctx->batches.slots;
}
bool
agx_batch_is_active(struct agx_batch *batch)
{
@@ -125,9 +119,7 @@ agx_batch_init(struct agx_context *ctx,
util_dynarray_init(&batch->scissor, ctx);
util_dynarray_init(&batch->depth_bias, ctx);
util_dynarray_init(&batch->occlusion_queries, ctx);
util_dynarray_init(&batch->nonocclusion_queries, ctx);
util_dynarray_init(&batch->timestamp_queries, ctx);
util_dynarray_init(&batch->timestamps, ctx);
batch->clear = 0;
batch->draw = 0;
@@ -177,8 +169,6 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
uint64_t begin_ts = ~0, end_ts = 0;
/* TODO: UAPI pending */
agx_finish_batch_queries(batch, begin_ts, end_ts);
batch->occlusion_buffer.cpu = NULL;
batch->occlusion_buffer.gpu = 0;
if (reset) {
int handle;
@@ -212,9 +202,7 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
util_dynarray_fini(&batch->scissor);
util_dynarray_fini(&batch->depth_bias);
util_dynarray_fini(&batch->occlusion_queries);
util_dynarray_fini(&batch->nonocclusion_queries);
util_dynarray_fini(&batch->timestamp_queries);
util_dynarray_fini(&batch->timestamps);
if (!(dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC))) {
agx_batch_print_stats(dev, batch);
@@ -774,13 +762,6 @@ agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch)
agx_batch_cleanup(ctx, batch, true);
}
void
agx_batch_add_timestamp_query(struct agx_batch *batch, struct agx_query *q)
{
if (q)
util_dynarray_append(&batch->timestamp_queries, struct agx_query *, q);
}
/*
* Timestamp queries record the time after all current work is finished,
* which we handle as the time after all current batches finish (since we're a


@@ -1333,19 +1333,6 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
*/
agx_batch_add_bo(batch, batch->vdm.bo);
/* Occlusion queries are allocated as a contiguous pool */
unsigned oq_count =
util_dynarray_num_elements(&batch->occlusion_queries, struct agx_query *);
size_t oq_size = oq_count * sizeof(uint64_t);
if (oq_size) {
batch->occlusion_buffer =
agx_pool_alloc_aligned(&batch->pool, oq_size, 64);
memset(batch->occlusion_buffer.cpu, 0, oq_size);
} else {
batch->occlusion_buffer.gpu = 0;
}
if (batch->vs_scratch)
agx_batch_add_bo(batch, ctx->scratch_vs.buf);
if (batch->fs_scratch)


@@ -6,24 +6,16 @@
#include <stdint.h>
#include "pipe/p_defines.h"
#include "util/bitset.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_inlines.h"
#include "util/u_prim.h"
#include "agx_bo.h"
#include "agx_device.h"
#include "agx_state.h"
#include "pool.h"
static struct pipe_query *
agx_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
{
struct agx_query *query = calloc(1, sizeof(struct agx_query));
query->type = query_type;
query->index = index;
return (struct pipe_query *)query;
}
static bool
is_occlusion(struct agx_query *query)
{
@@ -49,40 +41,184 @@ is_timer(struct agx_query *query)
}
}
#define AGX_MAX_OCCLUSION_QUERIES (65536)
struct agx_oq_heap {
/* The GPU allocation itself */
struct agx_bo *bo;
/* Bitset of query indices that are in use */
BITSET_DECLARE(available, AGX_MAX_OCCLUSION_QUERIES);
};
static void
agx_destroy_oq_heap(void *heap_)
{
struct agx_oq_heap *heap = heap_;
agx_bo_unreference(heap->bo);
}
static struct agx_oq_heap *
agx_alloc_oq_heap(struct agx_context *ctx)
{
struct agx_oq_heap *heap = rzalloc(ctx, struct agx_oq_heap);
ralloc_set_destructor(heap, agx_destroy_oq_heap);
heap->bo = agx_bo_create(agx_device(ctx->base.screen),
AGX_MAX_OCCLUSION_QUERIES * sizeof(uint64_t),
AGX_BO_WRITEBACK, "Occlusion query heap");
/* At the start, everything is available */
BITSET_ONES(heap->available);
return heap;
}
static struct agx_oq_heap *
agx_get_oq_heap(struct agx_context *ctx)
{
if (!ctx->oq)
ctx->oq = agx_alloc_oq_heap(ctx);
return ctx->oq;
}
static struct agx_ptr
agx_alloc_oq(struct agx_context *ctx)
{
struct agx_oq_heap *heap = agx_get_oq_heap(ctx);
/* Find first available */
int ffs = BITSET_FFS(heap->available);
if (!ffs)
return (struct agx_ptr){NULL, 0};
/* Allocate it */
unsigned index = ffs - 1;
BITSET_CLEAR(heap->available, index);
unsigned offset = index * sizeof(uint64_t);
return (struct agx_ptr){
(uint8_t *)heap->bo->ptr.cpu + offset,
heap->bo->ptr.gpu + offset,
};
}
static unsigned
agx_oq_index(struct agx_context *ctx, struct agx_query *q)
{
assert(is_occlusion(q));
return (q->ptr.gpu - ctx->oq->bo->ptr.gpu) / sizeof(uint64_t);
}
static void
agx_free_oq(struct agx_context *ctx, struct agx_query *q)
{
struct agx_oq_heap *heap = agx_get_oq_heap(ctx);
unsigned index = agx_oq_index(ctx, q);
assert(index < AGX_MAX_OCCLUSION_QUERIES);
assert(!BITSET_TEST(heap->available, index));
BITSET_SET(heap->available, index);
}
uint64_t
agx_get_occlusion_heap(struct agx_batch *batch)
{
if (!batch->ctx->oq)
return 0;
struct agx_bo *bo = batch->ctx->oq->bo;
if (agx_batch_uses_bo(batch, bo))
return bo->ptr.gpu;
else
return 0;
}
static struct pipe_query *
agx_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
{
struct agx_query *query = calloc(1, sizeof(struct agx_query));
query->type = query_type;
query->index = index;
/* Set all writer generations to a sentinel that will always compare as
* false, since nothing writes to no queries.
*/
for (unsigned i = 0; i < ARRAY_SIZE(query->writer_generation); ++i) {
query->writer_generation[i] = UINT64_MAX;
}
if (is_occlusion(query)) {
query->ptr = agx_alloc_oq(agx_context(ctx));
} else {
/* TODO: a BO for the query is wasteful, but we benefit from BO list
* tracking / reference counting to deal with lifetimes.
*/
query->bo = agx_bo_create(agx_device(ctx->screen), sizeof(uint64_t) * 2,
AGX_BO_WRITEBACK, "Query");
query->ptr = query->bo->ptr;
}
if (!query->ptr.gpu) {
free(query);
return NULL;
}
return (struct pipe_query *)query;
}
static void
sync_query_writers(struct agx_context *ctx, struct agx_query *query,
const char *reason)
{
STATIC_ASSERT(ARRAY_SIZE(ctx->batches.generation) == AGX_MAX_BATCHES);
STATIC_ASSERT(ARRAY_SIZE(ctx->batches.slots) == AGX_MAX_BATCHES);
STATIC_ASSERT(ARRAY_SIZE(query->writer_generation) == AGX_MAX_BATCHES);
for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
if (query->writer_generation[i] == ctx->batches.generation[i])
agx_sync_batch_for_reason(ctx, &ctx->batches.slots[i], reason);
}
}
static bool
is_query_busy(struct agx_context *ctx, struct agx_query *query)
{
for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
if (query->writer_generation[i] == ctx->batches.generation[i])
return true;
}
return false;
}
static void
agx_destroy_query(struct pipe_context *pctx, struct pipe_query *pquery)
{
struct agx_context *ctx = agx_context(pctx);
struct agx_query *query = (struct agx_query *)pquery;
/* It is legal for the query to be destroyed before its value is read,
* particularly during application teardown. In this case, don't leave a
* dangling reference to the query.
/* We don't reference count the occlusion query allocations, so we need to
* sync writers when destroying so we can freely write from the CPU after
* it's destroyed, since the driver will assume an available query is idle.
*
* For other queries, the BO itself is reference counted after the pipe_query
* is destroyed so we don't need to flush.
*/
if (query->writer) {
assert(!is_timer(query) && "single writer not used here");
struct agx_batch *writer = query->writer;
struct util_dynarray *array = is_occlusion(query)
? &writer->occlusion_queries
: &writer->nonocclusion_queries;
struct agx_query **ptr =
util_dynarray_element(array, struct agx_query *, query->writer_index);
assert((*ptr) == query && "data structure invariant");
*ptr = NULL;
} else if (is_timer(query)) {
/* Potentially has many writers! We need them all to synchronize so they
* don't have dangling references. Syncing will destroy batches that hold
* references as required.
*
* TODO: Optimize this, timestamp queries are bonkers on tilers.
*/
agx_flush_all(ctx, "Destroying time query");
agx_sync_all(ctx, "Destroying time query");
if (is_occlusion(query)) {
sync_query_writers(ctx, query, "Occlusion query destroy");
agx_free_oq(ctx, query);
} else {
agx_bo_unreference(query->bo);
}
free(query);
free(pquery);
}
static bool
@@ -118,9 +254,7 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)
case PIPE_QUERY_TIME_ELAPSED:
ctx->time_elapsed = query;
query->timestamp_begin = UINT64_MAX;
query->timestamp_end = 0;
return true;
break;
case PIPE_QUERY_TIMESTAMP:
/* No-op */
@@ -135,17 +269,17 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)
return false;
}
/* begin_query zeroes, flush so we can do that write. If anything (i.e.
* other than piglit) actually hits this, we could shadow the query to
* avoid the flush.
*/
if (query->writer) {
agx_flush_batch_for_reason(ctx, query->writer, "Query overwritten");
agx_sync_batch_for_reason(ctx, query->writer, "Query overwritten");
/* begin_query zeroes, sync so we can do that write from the CPU */
sync_query_writers(ctx, query, "Query overwritten");
uint64_t *ptr = query->ptr.cpu;
ptr[0] = 0;
if (query->type == PIPE_QUERY_TIME_ELAPSED) {
/* Timestamp begin in second record, the timestamp end in the first */
ptr[1] = UINT64_MAX;
}
assert(query->writer == NULL);
query->value = 0;
return true;
}
@@ -183,15 +317,17 @@ agx_end_query(struct pipe_context *pctx, struct pipe_query *pquery)
assert(query->index < ARRAY_SIZE(ctx->pipeline_statistics));
ctx->pipeline_statistics[query->index] = NULL;
return true;
case PIPE_QUERY_TIMESTAMP:
case PIPE_QUERY_TIMESTAMP: {
/* Timestamp logically written now, set up batches to MAX their finish
* time in. If there are no batches, it's just the current time stamp.
*/
agx_add_timestamp_end_query(ctx, query);
query->timestamp_end = agx_get_gpu_timestamp(dev);
uint64_t *value = query->ptr.cpu;
*value = agx_get_gpu_timestamp(dev);
return true;
}
default:
return false;
}
@@ -205,56 +341,37 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery,
struct agx_context *ctx = agx_context(pctx);
struct agx_device *dev = agx_device(pctx->screen);
/* For GPU queries, flush the writer. When the writer is flushed, the GPU
* will write the value, and when we wait for the writer, the CPU will read
* the value into query->value.
*/
if (query->writer != NULL) {
/* Querying the result forces a query to finish in finite time, so we
* need to flush. Furthermore, we need all earlier queries
* to finish before this query, so we sync unconditionally (so we can
* maintain the lie that all queries are finished when read).
*
* TODO: Optimize based on wait flag.
*/
struct agx_batch *writer = query->writer;
agx_flush_batch_for_reason(ctx, writer, "GPU query");
agx_sync_batch_for_reason(ctx, writer, "GPU query");
} else if (query->type == PIPE_QUERY_TIMESTAMP ||
query->type == PIPE_QUERY_TIME_ELAPSED) {
/* TODO: Optimize this... timestamp queries are bonkers on tilers. */
agx_flush_all(ctx, "Timestamp query");
agx_sync_all(ctx, "Timestamp query");
}
/* TODO: Honour `wait` */
sync_query_writers(ctx, query, "Reading query results");
/* After syncing, there is no writer left, so query->value is ready */
assert(query->writer == NULL && "cleared when cleaning up batch");
uint64_t *ptr = query->ptr.cpu;
uint64_t value = *ptr;
switch (query->type) {
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
vresult->b = query->value;
vresult->b = value;
return true;
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
vresult->b = query->value > 0;
vresult->b = value > 0;
return true;
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_PRIMITIVES_EMITTED:
case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
vresult->u64 = query->value;
vresult->u64 = value;
return true;
case PIPE_QUERY_TIMESTAMP:
vresult->u64 = agx_gpu_time_to_ns(dev, query->timestamp_end);
vresult->u64 = agx_gpu_time_to_ns(dev, value);
return true;
case PIPE_QUERY_TIME_ELAPSED:
vresult->u64 =
agx_gpu_time_to_ns(dev, query->timestamp_end - query->timestamp_begin);
/* end - begin */
vresult->u64 = agx_gpu_time_to_ns(dev, ptr[0] - ptr[1]);
return true;
default:
@@ -272,26 +389,25 @@ agx_get_query_result_resource(struct pipe_context *pipe, struct pipe_query *q,
/* TODO: Don't cheat XXX */
struct agx_context *ctx = agx_context(pipe);
agx_sync_all(ctx, "Stubbed QBOs");
union pipe_query_result result;
if (index < 0) {
/* availability */
result.u64 = 1;
result.u64 = !is_query_busy(ctx, query);
} else {
bool ready = agx_get_query_result(pipe, q, true, &result);
assert(ready);
}
switch (query->type) {
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
result.u64 = result.b;
break;
default:
break;
switch (query->type) {
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
result.u64 = result.b;
break;
default:
break;
}
}
/* Clamp to type, arb_query_buffer_object-qbo tests */
@@ -318,58 +434,36 @@ agx_set_active_query_state(struct pipe_context *pipe, bool enable)
ctx->dirty |= AGX_DIRTY_QUERY;
}
static uint16_t
agx_add_query_to_batch(struct agx_batch *batch, struct agx_query *query,
struct util_dynarray *array)
static void
agx_add_query_to_batch(struct agx_batch *batch, struct agx_query *query)
{
/* If written by another batch, flush it now. If this affects real apps, we
* could avoid this flush by merging query results.
*/
if (query->writer && query->writer != batch) {
agx_sync_batch_for_reason(batch->ctx, query->writer,
"Multiple query writers");
unsigned idx = agx_batch_idx(batch);
struct agx_bo *bo = is_occlusion(query) ? batch->ctx->oq->bo : query->bo;
agx_batch_add_bo(batch, bo);
query->writer_generation[idx] = batch->ctx->batches.generation[idx];
}
void
agx_batch_add_timestamp_query(struct agx_batch *batch, struct agx_query *q)
{
if (q) {
agx_add_query_to_batch(batch, q);
util_dynarray_append(&batch->timestamps, struct agx_ptr, q->ptr);
}
/* Allocate if needed */
if (query->writer == NULL) {
query->writer = batch;
query->writer_index =
util_dynarray_num_elements(array, struct agx_query *);
util_dynarray_append(array, struct agx_query *, query);
}
assert(query->writer == batch);
assert(*util_dynarray_element(array, struct agx_query *,
query->writer_index) == query);
return query->writer_index;
}
uint16_t
agx_get_oq_index(struct agx_batch *batch, struct agx_query *query)
{
assert(is_occlusion(query));
return agx_add_query_to_batch(batch, query, &batch->occlusion_queries);
agx_add_query_to_batch(batch, query);
return agx_oq_index(batch->ctx, query);
}
uint64_t
agx_get_query_address(struct agx_batch *batch, struct agx_query *query)
{
assert(!is_occlusion(query));
agx_add_query_to_batch(batch, query, &batch->nonocclusion_queries);
/* Allocate storage for the query in the batch */
if (!query->ptr.cpu) {
query->ptr = agx_pool_alloc_aligned(&batch->pool, sizeof(uint64_t),
sizeof(uint64_t));
uint64_t *value = query->ptr.cpu;
*value = 0;
}
agx_add_query_to_batch(batch, query);
return query->ptr.gpu;
}
@@ -377,61 +471,31 @@ void
agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
uint64_t end_ts)
{
uint64_t *occlusion = (uint64_t *)batch->occlusion_buffer.cpu;
/* Remove the batch as writer from all queries by incrementing the generation
* of the batch.
*/
batch->ctx->batches.generation[agx_batch_idx(batch)]++;
util_dynarray_foreach(&batch->occlusion_queries, struct agx_query *, it) {
struct agx_query *query = *it;
/* Write out timestamps */
util_dynarray_foreach(&batch->timestamps, struct agx_ptr, it) {
uint64_t *ptr = it->cpu;
/* Skip queries that have since been destroyed */
if (query == NULL)
continue;
assert(query->writer == batch);
/* Get the result for this batch. If occlusion is NULL, it means that no
* draws actually enabled any occlusion queries, so there's no change.
*/
if (occlusion != NULL) {
uint64_t result = *(occlusion++);
/* Accumulate with the previous result (e.g. in case we split a frame
* into multiple batches so an API-level query spans multiple batches).
*/
if (query->type == PIPE_QUERY_OCCLUSION_COUNTER)
query->value += result;
else
query->value |= (!!result);
}
query->writer = NULL;
query->writer_index = 0;
ptr[0] = MAX2(ptr[0], end_ts);
ptr[1] = MIN2(ptr[1], begin_ts);
}
}
/* Now handle non-occlusion queries in a similar way */
util_dynarray_foreach(&batch->nonocclusion_queries, struct agx_query *, it) {
struct agx_query *query = *it;
if (query == NULL)
continue;
void
agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
uint64_t increment)
{
if (!query)
return;
assert(query->writer == batch);
sync_query_writers(ctx, query, "CPU query increment");
/* Accumulate */
uint64_t *value = query->ptr.cpu;
query->value += (*value);
query->writer = NULL;
query->writer_index = 0;
query->ptr.cpu = NULL;
query->ptr.gpu = 0;
}
util_dynarray_foreach(&batch->timestamp_queries, struct agx_query *, it) {
struct agx_query *query = *it;
if (query == NULL)
continue;
query->timestamp_begin = MIN2(query->timestamp_begin, begin_ts);
query->timestamp_end = MAX2(query->timestamp_end, end_ts);
}
uint64_t *value = query->ptr.cpu;
*value += increment;
}
static void


@@ -3974,13 +3974,11 @@ agx_ia_update_direct(struct agx_context *ctx, const struct pipe_draw_info *info,
count *= info->instance_count;
if (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]) {
ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]->value += count;
}
agx_query_increment_cpu(
ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES], count);
if (ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]) {
ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]->value += count;
}
agx_query_increment_cpu(
ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS], count);
}
static uint64_t
@@ -4676,10 +4674,9 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
return;
/* TCS invocation counter increments once per-patch */
if (ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]) {
ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]->value +=
in_patches;
}
agx_query_increment_cpu(
ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS],
in_patches);
struct agx_batch *batch = agx_get_compute_batch(ctx);
agx_batch_init_state(batch);
@@ -4830,10 +4827,9 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
desc[4] = 0; /* start_instance */
/* TES invocation counter increments once per tessellated vertex */
if (ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]) {
ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]->value +=
data.num_domain_points;
}
agx_query_increment_cpu(
ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS],
data.num_domain_points);
}
p_tess_destroy(tess);
@@ -5419,7 +5415,8 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
unsigned blocksize = info->block[0] * info->block[1] * info->block[2];
unsigned count = workgroups * blocksize;
ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS]->value += count;
agx_query_increment_cpu(
ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS], count);
}
struct agx_batch *batch = agx_get_compute_batch(ctx);


@@ -372,15 +372,8 @@ struct agx_batch {
/* Scissor and depth-bias descriptors, uploaded at GPU time */
struct util_dynarray scissor, depth_bias;
/* Indexed occlusion queries within the occlusion buffer, and the occlusion
* buffer itself which is allocated at submit time.
*/
struct util_dynarray occlusion_queries;
struct agx_ptr occlusion_buffer;
/* Non-occlusion queries */
struct util_dynarray nonocclusion_queries;
struct util_dynarray timestamp_queries;
/* Arrays of GPU pointers that should be written with the batch timestamps */
struct util_dynarray timestamps;
/* Result buffer where the kernel places command execution information */
union agx_batch_result *result;
@@ -562,6 +555,8 @@ struct asahi_blitter {
void *saved_cs;
};
struct agx_oq_heap;
struct agx_context {
struct pipe_context base;
struct agx_compiled_shader *vs, *fs, *gs, *tcs, *tes;
@@ -570,6 +565,9 @@ struct agx_context {
/* Heap for dynamic memory allocation for geometry/tessellation shaders */
struct pipe_resource *heap;
/* Occlusion query heap */
struct agx_oq_heap *oq;
/* Acts as a context-level shader key */
bool support_lod_bias;
bool robust;
@@ -586,6 +584,12 @@ struct agx_context {
/** Set of submitted batches for faster traversal */
BITSET_DECLARE(submitted, AGX_MAX_BATCHES);
/* Monotonic counter for each batch incremented when resetting a batch to
* invalidate all associated queries. Compared to
* agx_query::writer_generation.
*/
uint64_t generation[AGX_MAX_BATCHES];
} batches;
struct agx_batch *batch;
@@ -656,6 +660,12 @@ struct agx_context {
struct agx_scratch scratch_cs;
};
static inline unsigned
agx_batch_idx(struct agx_batch *batch)
{
return batch - batch->ctx->batches.slots;
}
static void
agx_writer_add(struct agx_context *ctx, uint8_t batch_index, unsigned handle)
{
@@ -759,30 +769,9 @@ struct agx_query {
unsigned type;
unsigned index;
/* Invariant for occlusion queries:
*
* writer != NULL => writer->occlusion_queries[writer_index] == this, and
* writer == NULL => no batch such that this is in batch->occlusion_queries
*/
struct agx_batch *writer;
unsigned writer_index;
/* For GPU queries other than occlusion queries, the value of the query as
* written by the `writer` if a writer is non-NULL, and irrelevant otherwise.
* When flushing the query, this value is read and added to agx_query::value.
*/
uint64_t writer_generation[AGX_MAX_BATCHES];
struct agx_bo *bo;
struct agx_ptr ptr;
/* Accumulator flushed to the CPU */
union {
uint64_t value;
uint64_t timestamp_end;
};
/* For time elapsed queries, end is in the above union for consistent
* handling with timestamp queries.
*/
uint64_t timestamp_begin;
};
struct agx_sampler_state {
@@ -1046,6 +1035,9 @@ void agx_batch_add_timestamp_query(struct agx_batch *batch,
struct agx_query *q);
void agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q);
void agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
uint64_t increment);
/* Blit shaders */
void agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
bool render_cond);
@@ -1072,6 +1064,7 @@ uint64_t agx_build_meta(struct agx_batch *batch, bool store,
uint16_t agx_get_oq_index(struct agx_batch *batch, struct agx_query *query);
uint64_t agx_get_query_address(struct agx_batch *batch,
struct agx_query *query);
uint64_t agx_get_occlusion_heap(struct agx_batch *batch);
void agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
uint64_t end_ts);


@@ -171,8 +171,8 @@ agx_primitives_update_direct(struct agx_context *ctx,
assert(!ctx->stage[PIPE_SHADER_GEOMETRY].shader &&
"Geometry shaders use their own counting");
ctx->prims_generated[0]->value +=
xfb_prims_for_vertices(info->mode, draw->count);
agx_query_increment_cpu(ctx, ctx->prims_generated[0],
xfb_prims_for_vertices(info->mode, draw->count));
}
void