From 58d47ca32447fdf7762be22efd2394b9f399f0f8 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 2 Mar 2021 00:18:07 -0500
Subject: [PATCH] nv50: add compute invocations counter

This is a purely software counter alongside the other hardware counters
for ease of use and consistency. However, we have to make room for it in
the allocated query space. Use this opportunity to make the nv50 queries
work like the nvc0 ones in terms of space allocation.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Pierre Moreau <dev@pmoreau.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10164>
---
 .../drivers/nouveau/nv50/nv50_compute.c       |  3 ++
 .../drivers/nouveau/nv50/nv50_context.h       |  2 ++
 .../drivers/nouveau/nv50/nv50_query_hw.c      | 34 +++++++++++++------
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
index 5c1b27f9beb..b3ceaf79971 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_compute.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@@ -628,4 +628,7 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
 
    /* bind a compute shader clobbers fragment shader state */
    nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG;
+
+   nv50->compute_invocations += info->block[0] * info->block[1] * info->block[2] *
+      info->grid[0] * info->grid[1] * info->grid[2];
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index afd04d99ba6..af8a290db71 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -225,6 +225,8 @@ struct nv50_context {
    uint16_t images_valid;
 
    struct util_dynarray global_residents;
+
+   uint64_t compute_invocations;
 };
 
 static inline struct nv50_context *
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
index d91f46cf92b..50000ffff4c 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
@@ -174,14 +174,15 @@ nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q)
       nv50_hw_query_get(push, q, 0x30, 0x06805002);
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
-      nv50_hw_query_get(push, q, 0x80, 0x00801002); /* VFETCH, VERTICES */
-      nv50_hw_query_get(push, q, 0x90, 0x01801002); /* VFETCH, PRIMS */
-      nv50_hw_query_get(push, q, 0xa0, 0x02802002); /* VP, LAUNCHES */
-      nv50_hw_query_get(push, q, 0xb0, 0x03806002); /* GP, LAUNCHES */
-      nv50_hw_query_get(push, q, 0xc0, 0x04806002); /* GP, PRIMS_OUT */
-      nv50_hw_query_get(push, q, 0xd0, 0x07804002); /* RAST, PRIMS_IN */
-      nv50_hw_query_get(push, q, 0xe0, 0x08804002); /* RAST, PRIMS_OUT */
-      nv50_hw_query_get(push, q, 0xf0, 0x0980a002); /* ROP, PIXELS */
+      nv50_hw_query_get(push, q, 0x90, 0x00801002); /* VFETCH, VERTICES */
+      nv50_hw_query_get(push, q, 0xa0, 0x01801002); /* VFETCH, PRIMS */
+      nv50_hw_query_get(push, q, 0xb0, 0x02802002); /* VP, LAUNCHES */
+      nv50_hw_query_get(push, q, 0xc0, 0x03806002); /* GP, LAUNCHES */
+      nv50_hw_query_get(push, q, 0xd0, 0x04806002); /* GP, PRIMS_OUT */
+      nv50_hw_query_get(push, q, 0xe0, 0x07804002); /* RAST, PRIMS_IN */
+      nv50_hw_query_get(push, q, 0xf0, 0x08804002); /* RAST, PRIMS_OUT */
+      nv50_hw_query_get(push, q, 0x100, 0x0980a002); /* ROP, PIXELS */
+      ((uint64_t *)hq->data)[2 * 0x11] = nv50->compute_invocations;
       break;
    case PIPE_QUERY_TIME_ELAPSED:
       nv50_hw_query_get(push, q, 0x10, 0x00005002);
@@ -237,6 +238,7 @@ nv50_hw_end_query(struct nv50_context *nv50, struct nv50_query *q)
       nv50_hw_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */
       nv50_hw_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
       nv50_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
+      ((uint64_t *)hq->data)[2 * 0x8] = nv50->compute_invocations;
       break;
    case PIPE_QUERY_TIMESTAMP:
       hq->sequence++;
@@ -316,7 +318,8 @@ nv50_hw_get_query_result(struct nv50_context *nv50, struct nv50_query *q,
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
       for (i = 0; i < 8; ++i)
-         res64[i] = data64[i * 2] - data64[16 + i * 2];
+         res64[i] = data64[i * 2] - data64[18 + i * 2];
+      result->pipeline_statistics.cs_invocations = data64[i * 2] - data64[18 + i * 2];
       break;
    case PIPE_QUERY_TIMESTAMP:
       res64[0] = data64[1];
@@ -351,6 +354,7 @@ nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
 {
    struct nv50_hw_query *hq;
    struct nv50_query *q;
+   unsigned space = NV50_HW_QUERY_ALLOC_SPACE;
 
    hq = nv50_hw_sm_create_query(nv50, type);
    if (hq) {
@@ -380,15 +384,25 @@ nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED:
    case PIPE_QUERY_PRIMITIVES_EMITTED:
+      hq->is64bit = true;
+      space = 32;
+      break;
    case PIPE_QUERY_SO_STATISTICS:
+      hq->is64bit = true;
+      space = 64;
+      break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
       hq->is64bit = true;
+      space = 9 * 2 * 16; /* 9 values, start/end, 16-bytes each */
       break;
    case PIPE_QUERY_TIME_ELAPSED:
    case PIPE_QUERY_TIMESTAMP:
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
    case PIPE_QUERY_GPU_FINISHED:
+      space = 32;
+      break;
    case NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      space = 16;
       break;
    default:
       debug_printf("invalid query type: %u\n", type);
@@ -396,7 +410,7 @@ nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
       return NULL;
    }
 
-   if (!nv50_hw_query_allocate(nv50, q, NV50_HW_QUERY_ALLOC_SPACE)) {
+   if (!nv50_hw_query_allocate(nv50, q, space)) {
       FREE(hq);
       return NULL;
    }