asahi: rewrite queries
1. Always keep the query in GPU memory, so we can implement QBOs properly.
2. Use a lightweight data structure for tracking writers, to reduce overhead.
3. Allow many writers per query, to eliminate stalls.
4. Use a context-wide occlusion heap, to satisfy #1 without introducing flushes
   or silly copies. This is what the PVR Mesa driver does :-)

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
Committed by: Marge Bot
Parent: ca58bc239a
Commit: 23b4503225
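Points 2 and 3 of the message above replace the old single-writer pointer with per-batch generation counters, so any number of batches can write a query without stalling. Below is a minimal, self-contained sketch of that idea; the names (ctx, query, MAX_BATCHES, and the helper functions) are illustrative assumptions for this sketch, not the driver's actual API.

/* Sketch of generation-based writer tracking (illustrative names only).
 * Each batch slot carries a monotonically increasing generation. A query
 * remembers, per slot, the generation at which that batch last wrote it;
 * the batch is a live writer only while the two values still match.
 * Retiring or resetting a batch bumps its generation, which implicitly
 * detaches it from every query it wrote, with no per-query list upkeep.
 */
#include <stdbool.h>
#include <stdint.h>

#define MAX_BATCHES 128 /* arbitrary for this sketch */

struct ctx {
   uint64_t batch_generation[MAX_BATCHES];
};

struct query {
   uint64_t writer_generation[MAX_BATCHES];
};

static void
query_init(struct query *q)
{
   /* Sentinel that never equals a live generation */
   for (unsigned i = 0; i < MAX_BATCHES; ++i)
      q->writer_generation[i] = UINT64_MAX;
}

static void
query_add_writer(struct ctx *c, struct query *q, unsigned batch_idx)
{
   /* Record "this batch, at its current generation, writes this query" */
   q->writer_generation[batch_idx] = c->batch_generation[batch_idx];
}

static void
batch_retire(struct ctx *c, unsigned batch_idx)
{
   /* Invalidates this batch's claim on every query it wrote */
   c->batch_generation[batch_idx]++;
}

static bool
query_is_busy(const struct ctx *c, const struct query *q)
{
   for (unsigned i = 0; i < MAX_BATCHES; ++i) {
      if (q->writer_generation[i] == c->batch_generation[i])
         return true;
   }
   return false;
}

In the diff below, the same comparison appears in sync_query_writers() and is_query_busy(), with AGX_MAX_BATCHES slots and the generations stored in ctx->batches.generation.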
@@ -24,12 +24,6 @@
      agx_msg("[Batch %u] " fmt "\n", agx_batch_idx(batch), ##__VA_ARGS__);    \
   } while (0)

static unsigned
agx_batch_idx(struct agx_batch *batch)
{
   return batch - batch->ctx->batches.slots;
}

bool
agx_batch_is_active(struct agx_batch *batch)
{
@@ -125,9 +119,7 @@ agx_batch_init(struct agx_context *ctx,

   util_dynarray_init(&batch->scissor, ctx);
   util_dynarray_init(&batch->depth_bias, ctx);
   util_dynarray_init(&batch->occlusion_queries, ctx);
   util_dynarray_init(&batch->nonocclusion_queries, ctx);
   util_dynarray_init(&batch->timestamp_queries, ctx);
   util_dynarray_init(&batch->timestamps, ctx);

   batch->clear = 0;
   batch->draw = 0;
@@ -177,8 +169,6 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
   uint64_t begin_ts = ~0, end_ts = 0;
   /* TODO: UAPI pending */
   agx_finish_batch_queries(batch, begin_ts, end_ts);
   batch->occlusion_buffer.cpu = NULL;
   batch->occlusion_buffer.gpu = 0;

   if (reset) {
      int handle;
@@ -212,9 +202,7 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)

   util_dynarray_fini(&batch->scissor);
   util_dynarray_fini(&batch->depth_bias);
   util_dynarray_fini(&batch->occlusion_queries);
   util_dynarray_fini(&batch->nonocclusion_queries);
   util_dynarray_fini(&batch->timestamp_queries);
   util_dynarray_fini(&batch->timestamps);

   if (!(dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC))) {
      agx_batch_print_stats(dev, batch);
@@ -774,13 +762,6 @@ agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch)
   agx_batch_cleanup(ctx, batch, true);
}

void
agx_batch_add_timestamp_query(struct agx_batch *batch, struct agx_query *q)
{
   if (q)
      util_dynarray_append(&batch->timestamp_queries, struct agx_query *, q);
}

/*
 * Timestamp queries record the time after all current work is finished,
 * which we handle as the time after all current batches finish (since we're a
@@ -1333,19 +1333,6 @@ agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch)
    */
   agx_batch_add_bo(batch, batch->vdm.bo);

   /* Occlusion queries are allocated as a contiguous pool */
   unsigned oq_count =
      util_dynarray_num_elements(&batch->occlusion_queries, struct agx_query *);
   size_t oq_size = oq_count * sizeof(uint64_t);

   if (oq_size) {
      batch->occlusion_buffer =
         agx_pool_alloc_aligned(&batch->pool, oq_size, 64);
      memset(batch->occlusion_buffer.cpu, 0, oq_size);
   } else {
      batch->occlusion_buffer.gpu = 0;
   }

   if (batch->vs_scratch)
      agx_batch_add_bo(batch, ctx->scratch_vs.buf);
   if (batch->fs_scratch)
@@ -6,24 +6,16 @@

#include <stdint.h>
#include "pipe/p_defines.h"
#include "util/bitset.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_inlines.h"
#include "util/u_prim.h"
#include "agx_bo.h"
#include "agx_device.h"
#include "agx_state.h"
#include "pool.h"

static struct pipe_query *
agx_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
{
   struct agx_query *query = calloc(1, sizeof(struct agx_query));

   query->type = query_type;
   query->index = index;

   return (struct pipe_query *)query;
}

static bool
is_occlusion(struct agx_query *query)
{
@@ -49,40 +41,184 @@ is_timer(struct agx_query *query)
   }
}

#define AGX_MAX_OCCLUSION_QUERIES (65536)

struct agx_oq_heap {
   /* The GPU allocation itself */
   struct agx_bo *bo;

   /* Bitset of query indices that are in use */
   BITSET_DECLARE(available, AGX_MAX_OCCLUSION_QUERIES);
};

static void
agx_destroy_oq_heap(void *heap_)
{
   struct agx_oq_heap *heap = heap_;
   agx_bo_unreference(heap->bo);
}

static struct agx_oq_heap *
agx_alloc_oq_heap(struct agx_context *ctx)
{
   struct agx_oq_heap *heap = rzalloc(ctx, struct agx_oq_heap);
   ralloc_set_destructor(heap, agx_destroy_oq_heap);

   heap->bo = agx_bo_create(agx_device(ctx->base.screen),
                            AGX_MAX_OCCLUSION_QUERIES * sizeof(uint64_t),
                            AGX_BO_WRITEBACK, "Occlusion query heap");

   /* At the start, everything is available */
   BITSET_ONES(heap->available);

   return heap;
}

static struct agx_oq_heap *
agx_get_oq_heap(struct agx_context *ctx)
{
   if (!ctx->oq)
      ctx->oq = agx_alloc_oq_heap(ctx);

   return ctx->oq;
}

static struct agx_ptr
agx_alloc_oq(struct agx_context *ctx)
{
   struct agx_oq_heap *heap = agx_get_oq_heap(ctx);

   /* Find first available */
   int ffs = BITSET_FFS(heap->available);
   if (!ffs)
      return (struct agx_ptr){NULL, 0};

   /* Allocate it */
   unsigned index = ffs - 1;
   BITSET_CLEAR(heap->available, index);

   unsigned offset = index * sizeof(uint64_t);

   return (struct agx_ptr){
      (uint8_t *)heap->bo->ptr.cpu + offset,
      heap->bo->ptr.gpu + offset,
   };
}

static unsigned
agx_oq_index(struct agx_context *ctx, struct agx_query *q)
{
   assert(is_occlusion(q));

   return (q->ptr.gpu - ctx->oq->bo->ptr.gpu) / sizeof(uint64_t);
}

static void
agx_free_oq(struct agx_context *ctx, struct agx_query *q)
{
   struct agx_oq_heap *heap = agx_get_oq_heap(ctx);
   unsigned index = agx_oq_index(ctx, q);

   assert(index < AGX_MAX_OCCLUSION_QUERIES);
   assert(!BITSET_TEST(heap->available, index));

   BITSET_SET(heap->available, index);
}

uint64_t
agx_get_occlusion_heap(struct agx_batch *batch)
{
   if (!batch->ctx->oq)
      return 0;

   struct agx_bo *bo = batch->ctx->oq->bo;

   if (agx_batch_uses_bo(batch, bo))
      return bo->ptr.gpu;
   else
      return 0;
}

static struct pipe_query *
agx_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
{
   struct agx_query *query = calloc(1, sizeof(struct agx_query));

   query->type = query_type;
   query->index = index;

   /* Set all writer generations to a sentinel that will always compare as
    * false, since nothing writes to no queries.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(query->writer_generation); ++i) {
      query->writer_generation[i] = UINT64_MAX;
   }

   if (is_occlusion(query)) {
      query->ptr = agx_alloc_oq(agx_context(ctx));
   } else {
      /* TODO: a BO for the query is wasteful, but we benefit from BO list
       * tracking / reference counting to deal with lifetimes.
       */
      query->bo = agx_bo_create(agx_device(ctx->screen), sizeof(uint64_t) * 2,
                                AGX_BO_WRITEBACK, "Query");
      query->ptr = query->bo->ptr;
   }

   if (!query->ptr.gpu) {
      free(query);
      return NULL;
   }

   return (struct pipe_query *)query;
}

static void
sync_query_writers(struct agx_context *ctx, struct agx_query *query,
                   const char *reason)
{
   STATIC_ASSERT(ARRAY_SIZE(ctx->batches.generation) == AGX_MAX_BATCHES);
   STATIC_ASSERT(ARRAY_SIZE(ctx->batches.slots) == AGX_MAX_BATCHES);
   STATIC_ASSERT(ARRAY_SIZE(query->writer_generation) == AGX_MAX_BATCHES);

   for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
      if (query->writer_generation[i] == ctx->batches.generation[i])
         agx_sync_batch_for_reason(ctx, &ctx->batches.slots[i], reason);
   }
}

static bool
is_query_busy(struct agx_context *ctx, struct agx_query *query)
{
   for (unsigned i = 0; i < AGX_MAX_BATCHES; ++i) {
      if (query->writer_generation[i] == ctx->batches.generation[i])
         return true;
   }

   return false;
}

static void
agx_destroy_query(struct pipe_context *pctx, struct pipe_query *pquery)
{
   struct agx_context *ctx = agx_context(pctx);
   struct agx_query *query = (struct agx_query *)pquery;

   /* It is legal for the query to be destroyed before its value is read,
    * particularly during application teardown. In this case, don't leave a
    * dangling reference to the query.
   /* We don't reference count the occlusion query allocations, so we need to
    * sync writers when destroying so we can freely write from the CPU after
    * it's destroyed, since the driver will assume an available query is idle.
    *
    * For other queries, the BO itself is reference counted after the pipe_query
    * is destroyed so we don't need to flush.
    */
   if (query->writer) {
      assert(!is_timer(query) && "single writer not used here");

      struct agx_batch *writer = query->writer;
      struct util_dynarray *array = is_occlusion(query)
                                       ? &writer->occlusion_queries
                                       : &writer->nonocclusion_queries;
      struct agx_query **ptr =
         util_dynarray_element(array, struct agx_query *, query->writer_index);

      assert((*ptr) == query && "data structure invariant");
      *ptr = NULL;
   } else if (is_timer(query)) {
      /* Potentially has many writers! We need them all to synchronize so they
       * don't have dangling references. Syncing will destroy batches that hold
       * references as required.
       *
       * TODO: Optimize this, timestamp queries are bonkers on tilers.
       */
      agx_flush_all(ctx, "Destroying time query");
      agx_sync_all(ctx, "Destroying time query");
   if (is_occlusion(query)) {
      sync_query_writers(ctx, query, "Occlusion query destroy");
      agx_free_oq(ctx, query);
   } else {
      agx_bo_unreference(query->bo);
   }

   free(query);
   free(pquery);
}

static bool
@@ -118,9 +254,7 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)

   case PIPE_QUERY_TIME_ELAPSED:
      ctx->time_elapsed = query;
      query->timestamp_begin = UINT64_MAX;
      query->timestamp_end = 0;
      return true;
      break;

   case PIPE_QUERY_TIMESTAMP:
      /* No-op */
@@ -135,17 +269,17 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)
      return false;
   }

   /* begin_query zeroes, flush so we can do that write. If anything (i.e.
    * other than piglit) actually hits this, we could shadow the query to
    * avoid the flush.
    */
   if (query->writer) {
      agx_flush_batch_for_reason(ctx, query->writer, "Query overwritten");
      agx_sync_batch_for_reason(ctx, query->writer, "Query overwrriten");
   /* begin_query zeroes, sync so we can do that write from the CPU */
   sync_query_writers(ctx, query, "Query overwritten");

   uint64_t *ptr = query->ptr.cpu;
   ptr[0] = 0;

   if (query->type == PIPE_QUERY_TIME_ELAPSED) {
      /* Timestamp begin in second record, the timestamp end in the first */
      ptr[1] = UINT64_MAX;
   }

   assert(query->writer == NULL);
   query->value = 0;
   return true;
}
@@ -183,15 +317,17 @@ agx_end_query(struct pipe_context *pctx, struct pipe_query *pquery)
      assert(query->index < ARRAY_SIZE(ctx->pipeline_statistics));
      ctx->pipeline_statistics[query->index] = NULL;
      return true;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP: {
      /* Timestamp logically written now, set up batches to MAX their finish
       * time in. If there are no batches, it's just the current time stamp.
       */
      agx_add_timestamp_end_query(ctx, query);

      query->timestamp_end = agx_get_gpu_timestamp(dev);
      uint64_t *value = query->ptr.cpu;
      *value = agx_get_gpu_timestamp(dev);

      return true;
   }
   default:
      return false;
   }
@@ -205,56 +341,37 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery,
   struct agx_context *ctx = agx_context(pctx);
   struct agx_device *dev = agx_device(pctx->screen);

   /* For GPU queries, flush the writer. When the writer is flushed, the GPU
    * will write the value, and when we wait for the writer, the CPU will read
    * the value into query->value.
    */
   if (query->writer != NULL) {
      /* Querying the result forces a query to finish in finite time, so we
       * need to flush. Furthermore, we need all earlier queries
       * to finish before this query, so we sync unconditionally (so we can
       * maintain the lie that all queries are finished when read).
       *
       * TODO: Optimize based on wait flag.
       */
      struct agx_batch *writer = query->writer;
      agx_flush_batch_for_reason(ctx, writer, "GPU query");
      agx_sync_batch_for_reason(ctx, writer, "GPU query");
   } else if (query->type == PIPE_QUERY_TIMESTAMP ||
              query->type == PIPE_QUERY_TIME_ELAPSED) {
      /* TODO: Optimize this... timestamp queries are bonkers on tilers. */
      agx_flush_all(ctx, "Timestamp query");
      agx_sync_all(ctx, "Timestamp query");
   }
   /* TODO: Honour `wait` */
   sync_query_writers(ctx, query, "Reading query results");

   /* After syncing, there is no writer left, so query->value is ready */
   assert(query->writer == NULL && "cleared when cleaning up batch");
   uint64_t *ptr = query->ptr.cpu;
   uint64_t value = *ptr;

   switch (query->type) {
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      vresult->b = query->value;
      vresult->b = value;
      return true;

   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      vresult->b = query->value > 0;
      vresult->b = value > 0;
      return true;

   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE:
      vresult->u64 = query->value;
      vresult->u64 = value;
      return true;

   case PIPE_QUERY_TIMESTAMP:
      vresult->u64 = agx_gpu_time_to_ns(dev, query->timestamp_end);
      vresult->u64 = agx_gpu_time_to_ns(dev, value);
      return true;

   case PIPE_QUERY_TIME_ELAPSED:
      vresult->u64 =
         agx_gpu_time_to_ns(dev, query->timestamp_end - query->timestamp_begin);
      /* end - begin */
      vresult->u64 = agx_gpu_time_to_ns(dev, ptr[0] - ptr[1]);
      return true;

   default:
@@ -272,26 +389,25 @@ agx_get_query_result_resource(struct pipe_context *pipe, struct pipe_query *q,

   /* TODO: Don't cheat XXX */
   struct agx_context *ctx = agx_context(pipe);
   agx_sync_all(ctx, "Stubbed QBOs");

   union pipe_query_result result;
   if (index < 0) {
      /* availability */
      result.u64 = 1;
      result.u64 = !is_query_busy(ctx, query);
   } else {
      bool ready = agx_get_query_result(pipe, q, true, &result);
      assert(ready);
   }

   switch (query->type) {
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      result.u64 = result.b;
      break;
   default:
      break;
      switch (query->type) {
      case PIPE_QUERY_OCCLUSION_PREDICATE:
      case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         result.u64 = result.b;
         break;
      default:
         break;
      }
   }

   /* Clamp to type, arb_query_buffer_object-qbo tests */
@@ -318,58 +434,36 @@ agx_set_active_query_state(struct pipe_context *pipe, bool enable)
   ctx->dirty |= AGX_DIRTY_QUERY;
}

static uint16_t
agx_add_query_to_batch(struct agx_batch *batch, struct agx_query *query,
                       struct util_dynarray *array)
static void
agx_add_query_to_batch(struct agx_batch *batch, struct agx_query *query)
{
   /* If written by another batch, flush it now. If this affects real apps, we
    * could avoid this flush by merging query results.
    */
   if (query->writer && query->writer != batch) {
      agx_sync_batch_for_reason(batch->ctx, query->writer,
                                "Multiple query writers");
   unsigned idx = agx_batch_idx(batch);
   struct agx_bo *bo = is_occlusion(query) ? batch->ctx->oq->bo : query->bo;

   agx_batch_add_bo(batch, bo);
   query->writer_generation[idx] = batch->ctx->batches.generation[idx];
}

void
agx_batch_add_timestamp_query(struct agx_batch *batch, struct agx_query *q)
{
   if (q) {
      agx_add_query_to_batch(batch, q);
      util_dynarray_append(&batch->timestamps, struct agx_ptr, q->ptr);
   }

   /* Allocate if needed */
   if (query->writer == NULL) {
      query->writer = batch;
      query->writer_index =
         util_dynarray_num_elements(array, struct agx_query *);

      util_dynarray_append(array, struct agx_query *, query);
   }

   assert(query->writer == batch);
   assert(*util_dynarray_element(array, struct agx_query *,
                                 query->writer_index) == query);

   return query->writer_index;
}

uint16_t
agx_get_oq_index(struct agx_batch *batch, struct agx_query *query)
{
   assert(is_occlusion(query));

   return agx_add_query_to_batch(batch, query, &batch->occlusion_queries);
   agx_add_query_to_batch(batch, query);
   return agx_oq_index(batch->ctx, query);
}

uint64_t
agx_get_query_address(struct agx_batch *batch, struct agx_query *query)
{
   assert(!is_occlusion(query));

   agx_add_query_to_batch(batch, query, &batch->nonocclusion_queries);

   /* Allocate storage for the query in the batch */
   if (!query->ptr.cpu) {
      query->ptr = agx_pool_alloc_aligned(&batch->pool, sizeof(uint64_t),
                                          sizeof(uint64_t));

      uint64_t *value = query->ptr.cpu;
      *value = 0;
   }

   agx_add_query_to_batch(batch, query);
   return query->ptr.gpu;
}
@@ -377,61 +471,31 @@ void
agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
                         uint64_t end_ts)
{
   uint64_t *occlusion = (uint64_t *)batch->occlusion_buffer.cpu;
   /* Remove the batch as write from all queries by incrementing the generation
    * of the batch.
    */
   batch->ctx->batches.generation[agx_batch_idx(batch)]++;

   util_dynarray_foreach(&batch->occlusion_queries, struct agx_query *, it) {
      struct agx_query *query = *it;
   /* Write out timestamps */
   util_dynarray_foreach(&batch->timestamps, struct agx_ptr, it) {
      uint64_t *ptr = it->cpu;

      /* Skip queries that have since been destroyed */
      if (query == NULL)
         continue;

      assert(query->writer == batch);

      /* Get the result for this batch. If occlusion is NULL, it means that no
       * draws actually enabled any occlusion queries, so there's no change.
       */
      if (occlusion != NULL) {
         uint64_t result = *(occlusion++);

         /* Accumulate with the previous result (e.g. in case we split a frame
          * into multiple batches so an API-level query spans multiple batches).
          */
         if (query->type == PIPE_QUERY_OCCLUSION_COUNTER)
            query->value += result;
         else
            query->value |= (!!result);
      }

      query->writer = NULL;
      query->writer_index = 0;
      ptr[0] = MAX2(ptr[0], end_ts);
      ptr[1] = MIN2(ptr[1], begin_ts);
   }
}

   /* Now handle non-occlusion queries in a similar way */
   util_dynarray_foreach(&batch->nonocclusion_queries, struct agx_query *, it) {
      struct agx_query *query = *it;
      if (query == NULL)
         continue;
void
agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
                        uint64_t increment)
{
   if (!query)
      return;

      assert(query->writer == batch);
   sync_query_writers(ctx, query, "CPU query increment");

   /* Accumulate */
   uint64_t *value = query->ptr.cpu;
   query->value += (*value);
   query->writer = NULL;
   query->writer_index = 0;
   query->ptr.cpu = NULL;
   query->ptr.gpu = 0;
}

   util_dynarray_foreach(&batch->timestamp_queries, struct agx_query *, it) {
      struct agx_query *query = *it;
      if (query == NULL)
         continue;

      query->timestamp_begin = MIN2(query->timestamp_begin, begin_ts);
      query->timestamp_end = MAX2(query->timestamp_end, end_ts);
   }
   uint64_t *value = query->ptr.cpu;
   *value += increment;
}

static void
@@ -3974,13 +3974,11 @@ agx_ia_update_direct(struct agx_context *ctx, const struct pipe_draw_info *info,

   count *= info->instance_count;

   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]) {
      ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]->value += count;
   }
   agx_query_increment_cpu(
      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES], count);

   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]) {
      ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]->value += count;
   }
   agx_query_increment_cpu(
      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS], count);
}

static uint64_t
@@ -4676,10 +4674,9 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
      return;

   /* TCS invocation counter increments once per-patch */
   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]) {
      ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]->value +=
         in_patches;
   }
   agx_query_increment_cpu(
      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS],
      in_patches);

   struct agx_batch *batch = agx_get_compute_batch(ctx);
   agx_batch_init_state(batch);
@@ -4830,10 +4827,9 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
   desc[4] = 0; /* start_instance */

   /* TES invocation counter increments once per tessellated vertex */
   if (ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]) {
      ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]->value +=
         data.num_domain_points;
   }
   agx_query_increment_cpu(
      ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS],
      data.num_domain_points);
   }
   p_tess_destroy(tess);
@@ -5419,7 +5415,8 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
      unsigned blocksize = info->block[0] * info->block[1] * info->block[2];
      unsigned count = workgroups * blocksize;

      ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS]->value += count;
      agx_query_increment_cpu(
         ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS], count);
   }

   struct agx_batch *batch = agx_get_compute_batch(ctx);
@@ -372,15 +372,8 @@ struct agx_batch {
   /* Scissor and depth-bias descriptors, uploaded at GPU time */
   struct util_dynarray scissor, depth_bias;

   /* Indexed occlusion queries within the occlusion buffer, and the occlusion
    * buffer itself which is allocated at submit time.
    */
   struct util_dynarray occlusion_queries;
   struct agx_ptr occlusion_buffer;

   /* Non-occlusion queries */
   struct util_dynarray nonocclusion_queries;
   struct util_dynarray timestamp_queries;
   /* Arrays of GPU pointers that should be written with the batch timestamps */
   struct util_dynarray timestamps;

   /* Result buffer where the kernel places command execution information */
   union agx_batch_result *result;
@@ -562,6 +555,8 @@ struct asahi_blitter {
   void *saved_cs;
};

struct agx_oq_heap;

struct agx_context {
   struct pipe_context base;
   struct agx_compiled_shader *vs, *fs, *gs, *tcs, *tes;
@@ -570,6 +565,9 @@ struct agx_context {
   /* Heap for dynamic memory allocation for geometry/tessellation shaders */
   struct pipe_resource *heap;

   /* Occlusion query heap */
   struct agx_oq_heap *oq;

   /* Acts as a context-level shader key */
   bool support_lod_bias;
   bool robust;
@@ -586,6 +584,12 @@ struct agx_context {

      /** Set of submitted batches for faster traversal */
      BITSET_DECLARE(submitted, AGX_MAX_BATCHES);

      /* Monotonic counter for each batch incremented when resetting a batch to
       * invalidate all associated queries. Compared to
       * agx_query::writer_generation.
       */
      uint64_t generation[AGX_MAX_BATCHES];
   } batches;

   struct agx_batch *batch;
@@ -656,6 +660,12 @@ struct agx_context {
   struct agx_scratch scratch_cs;
};

static inline unsigned
agx_batch_idx(struct agx_batch *batch)
{
   return batch - batch->ctx->batches.slots;
}

static void
agx_writer_add(struct agx_context *ctx, uint8_t batch_index, unsigned handle)
{
@@ -759,30 +769,9 @@ struct agx_query {
   unsigned type;
   unsigned index;

   /* Invariant for occlusion queries:
    *
    * writer != NULL => writer->occlusion_queries[writer_index] == this, and
    * writer == NULL => no batch such that this in batch->occlusion_queries
    */
   struct agx_batch *writer;
   unsigned writer_index;

   /* For GPU queries other than occlusion queries, the value of the query as
    * written by the `writer` if a writer is non-NULL, and irrelevant otherwise.
    * When flushing the query, this value is read and added to agx_query::value.
    */
   uint64_t writer_generation[AGX_MAX_BATCHES];
   struct agx_bo *bo;
   struct agx_ptr ptr;

   /* Accumulator flushed to the CPU */
   union {
      uint64_t value;
      uint64_t timestamp_end;
   };

   /* For time elapsed queries, end is in the above union for consistent
    * handling with timestamp queries.
    */
   uint64_t timestamp_begin;
};

struct agx_sampler_state {
@@ -1046,6 +1035,9 @@ void agx_batch_add_timestamp_query(struct agx_batch *batch,
                                   struct agx_query *q);
void agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q);

void agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
                             uint64_t increment);

/* Blit shaders */
void agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
                      bool render_cond);
@@ -1072,6 +1064,7 @@ uint64_t agx_build_meta(struct agx_batch *batch, bool store,
uint16_t agx_get_oq_index(struct agx_batch *batch, struct agx_query *query);
uint64_t agx_get_query_address(struct agx_batch *batch,
                               struct agx_query *query);
uint64_t agx_get_occlusion_heap(struct agx_batch *batch);

void agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
                              uint64_t end_ts);
@@ -171,8 +171,8 @@ agx_primitives_update_direct(struct agx_context *ctx,
   assert(!ctx->stage[PIPE_SHADER_GEOMETRY].shader &&
          "Geometry shaders use their own counting");

   ctx->prims_generated[0]->value +=
      xfb_prims_for_vertices(info->mode, draw->count);
   agx_query_increment_cpu(ctx, ctx->prims_generated[0],
                           xfb_prims_for_vertices(info->mode, draw->count));
}

void