nvk: rewrite query copy shader in CL C

as previously discussed.

this is using "library CL", the older way of doing things, instead of kernel CL.
it works, it just has more per-kernel boilerplate than we'd otherwise want. but
library CL is basically free to integrate into a driver, whereas kernel CL
requires a lot more upfront investment. (I'm working on cleaning that up, but
we're not quite there yet.)

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33362>
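To make the "per-kernel boilerplate" point concrete: the CL source added below is compiled once by mesa_clc to SPIR-V and then run through vtn_bindgen2 (see the meson hunks), which is assumed to emit a NIR-building binding named after the CL function into the generated nvkcl.h, roughly shaped like the declaration below. The binding name and call shape come from the nvk_query.c hunk at the end of this commit; the exact generated signature is an assumption.

#include "compiler/nir/nir_builder.h"

/* Assumed shape of the vtn_bindgen2-generated binding in nvkcl.h: one
 * nir_def * parameter per CL argument, emitted into the caller's shader. */
void nvk_copy_queries(nir_builder *b,
                      nir_def *pool_addr, nir_def *query_start,
                      nir_def *query_stride, nir_def *first_query,
                      nir_def *query_count, nir_def *dst_addr,
                      nir_def *dst_stride, nir_def *flags);

The driver-side half of the boilerplate is build_copy_queries_shader in nvk_query.c, which still hand-builds a compute shader, loads each push-constant field, and forwards it to this binding; that wrapper is presumably what kernel CL integration would eventually fold into common code.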
Author:    Alyssa Rosenzweig
Date:      2025-02-03 14:02:19 -05:00
Committer: Marge Bot
Parent:    c249556bf4
Commit:    feedd427b3

5 changed files with 97 additions and 161 deletions

@@ -834,6 +834,7 @@ with_driver_using_cl = [
with_gallium_iris, with_intel_vk,
with_gallium_asahi, with_asahi_vk,
with_gallium_panfrost, with_panfrost_vk,
with_nouveau_vk,
].contains(true)
if get_option('mesa-clc') == 'system'

@@ -0,0 +1,53 @@
/*
* Copyright © 2022 Collabora Ltd. and Red Hat Inc.
* SPDX-License-Identifier: MIT
*/
#include "compiler/libcl/libcl_vk.h"
#include "nvk_query.h"

void
nvk_copy_queries(uint64_t pool_addr, uint query_start, uint query_stride,
                 uint first_query, uint query_count, uint64_t dst_addr,
                 uint64_t dst_stride, uint flags)
{
   uint i = get_sub_group_local_id() + cl_group_id.x * 32;
   if (i >= query_count)
      return;

   uint query = first_query + i;

   global uint *available_arr = (global uint *)pool_addr;
   bool available = available_arr[query] != 0;
   bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

   uint64_t report_offs = query_start + (uint64_t)query * (uint64_t)query_stride;
   global struct nvk_query_report *report =
      (global void *)(pool_addr + report_offs);

   uint64_t dst_offset = dst_stride * (uint64_t)i;

   uint num_reports = 1;
   if (query_stride == sizeof(struct nvk_query_report)) {
      /* Timestamp queries are the only ones that use a single report */
      if (write_results) {
         vk_write_query(dst_addr + dst_offset, 0, flags, report->timestamp);
      }
   } else {
      /* Everything that isn't a timestamp has the invariant that the
       * number of destination entries is equal to the query stride divided
       * by the size of two reports.
       */
      num_reports = query_stride / (2 * sizeof(struct nvk_query_report));

      if (write_results) {
         for (uint r = 0; r < num_reports; ++r) {
            uint64_t delta = report[(r * 2) + 1].value - report[r * 2].value;
            vk_write_query(dst_addr + dst_offset, r, flags, delta);
         }
      }
   }

   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
      vk_write_query(dst_addr + dst_offset, num_reports, flags, available);
   }
}
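The stride invariant in the comment above is easiest to see with numbers: every non-timestamp result value is backed by a begin/end pair of reports, and each report is two uint64_t fields (16 bytes), so num_reports falls straight out of the stride. A minimal host-side illustration in plain C follows; the strides used here are made-up examples, not values taken from the driver.

#include <stdint.h>

/* Same layout as cl/nvk_query.h: two uint64_t fields, 16 bytes total. */
struct nvk_query_report {
   uint64_t value;
   uint64_t timestamp;
};

/* One begin/end report pair (32 bytes of stride) per destination entry. */
_Static_assert(32 / (2 * sizeof(struct nvk_query_report)) == 1,
               "32-byte stride -> 1 result entry");
_Static_assert(96 / (2 * sizeof(struct nvk_query_report)) == 3,
               "96-byte stride -> 3 result entries");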

@@ -0,0 +1,12 @@
/*
* Copyright © 2022 Collabora Ltd. and Red Hat Inc.
* SPDX-License-Identifier: MIT
*/
#pragma once

#include "compiler/libcl/libcl.h"

struct nvk_query_report {
   uint64_t value;
   uint64_t timestamp;
};

@@ -75,6 +75,10 @@ nvk_files = files(
'nvkmd/nvkmd.h',
)
nvkcl_files = files(
'cl/nvk_query.cl',
)
nvk_entrypoints = custom_target(
'nvk_entrypoints',
input : [vk_entrypoints_gen, vk_api_xml],
@@ -87,6 +91,26 @@ nvk_entrypoints = custom_target(
depend_files : vk_entrypoints_gen_depend_files,
)
relative_dir = fs.relative_to(meson.global_source_root(), meson.global_build_root())
nvkcl_spv = custom_target(
input : nvkcl_files,
output : 'nvkcl.spv',
command : [
prog_mesa_clc, '-o', '@OUTPUT@', '--depfile', '@DEPFILE@', nvkcl_files, '--',
'-I' + join_paths(meson.current_source_dir(), '.'),
'-I' + join_paths(meson.project_source_root(), 'src'),
'-fmacro-prefix-map=@0@/='.format(relative_dir),
],
depfile : '@PLAINNAME@.d',
)
nvkcl = custom_target(
input : nvkcl_spv,
output : ['nvkcl.cpp', 'nvkcl.h'],
command : [prog_vtn_bindgen2, '@INPUT@', '@OUTPUT@'],
)
nvk_deps = [
dep_libdrm,
idep_nak,
@@ -116,6 +140,7 @@ libnvk = static_library(
nvk_entrypoints,
cl_generated,
nvk_files,
nvkcl,
sha1_h,
],
include_directories : [

@@ -17,8 +17,10 @@
#include "vk_meta.h"
#include "vk_pipeline.h"
#include "cl/nvk_query.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "nvkcl.h"
#include "util/os_time.h"
@@ -27,11 +29,6 @@
#include "nv_push_cla0c0.h"
#include "nv_push_clc597.h"
struct nvk_query_report {
uint64_t value;
uint64_t timestamp;
};
VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateQueryPool(VkDevice device,
const VkQueryPoolCreateInfo *pCreateInfo,
@@ -118,14 +115,6 @@ nvk_query_available_addr(struct nvk_query_pool *pool, uint32_t query)
return pool->mem->va->addr + query * sizeof(uint32_t);
}
static nir_def *
nvk_nir_available_addr(nir_builder *b, nir_def *pool_addr,
nir_def *query)
{
nir_def *offset = nir_imul_imm(b, query, sizeof(uint32_t));
return nir_iadd(b, pool_addr, nir_u2u64(b, offset));
}
static uint32_t *
nvk_query_available_map(struct nvk_query_pool *pool, uint32_t query)
{
@@ -146,16 +135,6 @@ nvk_query_report_addr(struct nvk_query_pool *pool, uint32_t query)
return pool->mem->va->addr + nvk_query_offset(pool, query);
}
static nir_def *
nvk_nir_query_report_addr(nir_builder *b, nir_def *pool_addr,
nir_def *query_start, nir_def *query_stride,
nir_def *query)
{
nir_def *offset =
nir_iadd(b, query_start, nir_umul_2x32_64(b, query, query_stride));
return nir_iadd(b, pool_addr, offset);
}
static struct nvk_query_report *
nvk_query_report_map(struct nvk_query_pool *pool, uint32_t query)
{
@@ -715,135 +694,6 @@ load_struct_var(nir_builder *b, nir_variable *var, uint32_t field)
return nir_load_deref(b, deref);
}
static void
nir_write_query_result(nir_builder *b, nir_def *dst_addr,
nir_def *idx, nir_def *flags,
nir_def *result)
{
assert(result->num_components == 1);
assert(result->bit_size == 64);
nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
{
nir_def *offset = nir_i2i64(b, nir_imul_imm(b, idx, 8));
nir_store_global(b, nir_iadd(b, dst_addr, offset), 8, result, 0x1);
}
nir_push_else(b, NULL);
{
nir_def *result32 = nir_u2u32(b, result);
nir_def *offset = nir_i2i64(b, nir_imul_imm(b, idx, 4));
nir_store_global(b, nir_iadd(b, dst_addr, offset), 4, result32, 0x1);
}
nir_pop_if(b, NULL);
}
static void
nir_get_query_delta(nir_builder *b, nir_def *dst_addr,
nir_def *report_addr, nir_def *idx,
nir_def *flags)
{
nir_def *offset =
nir_imul_imm(b, idx, 2 * sizeof(struct nvk_query_report));
nir_def *begin_addr =
nir_iadd(b, report_addr, nir_i2i64(b, offset));
nir_def *end_addr =
nir_iadd_imm(b, begin_addr, sizeof(struct nvk_query_report));
/* nvk_query_report::timestamp is the first uint64_t */
nir_def *begin = nir_load_global(b, begin_addr, 16, 1, 64);
nir_def *end = nir_load_global(b, end_addr, 16, 1, 64);
nir_def *delta = nir_isub(b, end, begin);
nir_write_query_result(b, dst_addr, idx, flags, delta);
}
static void
nvk_nir_copy_query(nir_builder *b, nir_variable *push, nir_def *i)
{
nir_def *pool_addr = load_struct_var(b, push, 0);
nir_def *query_start = nir_u2u64(b, load_struct_var(b, push, 1));
nir_def *query_stride = load_struct_var(b, push, 2);
nir_def *first_query = load_struct_var(b, push, 3);
nir_def *dst_addr = load_struct_var(b, push, 5);
nir_def *dst_stride = load_struct_var(b, push, 6);
nir_def *flags = load_struct_var(b, push, 7);
nir_def *query = nir_iadd(b, first_query, i);
nir_def *avail_addr = nvk_nir_available_addr(b, pool_addr, query);
nir_def *available =
nir_i2b(b, nir_load_global(b, avail_addr, 4, 1, 32));
nir_def *partial = nir_test_mask(b, flags, VK_QUERY_RESULT_PARTIAL_BIT);
nir_def *write_results = nir_ior(b, available, partial);
nir_def *report_addr =
nvk_nir_query_report_addr(b, pool_addr, query_start, query_stride,
query);
nir_def *dst_offset = nir_imul(b, nir_u2u64(b, i), dst_stride);
/* Timestamp queries are the only ones use a single report */
nir_def *is_timestamp =
nir_ieq_imm(b, query_stride, sizeof(struct nvk_query_report));
nir_def *one = nir_imm_int(b, 1);
nir_def *num_reports;
nir_push_if(b, is_timestamp);
{
nir_push_if(b, write_results);
{
/* This is the timestamp case. We add 8 because we're loading
* nvk_query_report::timestamp.
*/
nir_def *timestamp =
nir_load_global(b, nir_iadd_imm(b, report_addr, 8), 8, 1, 64);
nir_write_query_result(b, nir_iadd(b, dst_addr, dst_offset),
nir_imm_int(b, 0), flags, timestamp);
}
nir_pop_if(b, NULL);
}
nir_push_else(b, NULL);
{
/* Everything that isn't a timestamp has the invariant that the
* number of destination entries is equal to the query stride divided
* by the size of two reports.
*/
num_reports = nir_udiv_imm(b, query_stride,
2 * sizeof(struct nvk_query_report));
nir_push_if(b, write_results);
{
nir_variable *r =
nir_local_variable_create(b->impl, glsl_uint_type(), "r");
nir_store_var(b, r, nir_imm_int(b, 0), 0x1);
nir_push_loop(b);
{
nir_break_if(b, nir_ige(b, nir_load_var(b, r), num_reports));
nir_get_query_delta(b, nir_iadd(b, dst_addr, dst_offset),
report_addr, nir_load_var(b, r), flags);
nir_store_var(b, r, nir_iadd_imm(b, nir_load_var(b, r), 1), 0x1);
}
nir_pop_loop(b, NULL);
}
nir_pop_if(b, NULL);
}
nir_pop_if(b, NULL);
num_reports = nir_if_phi(b, one, num_reports);
nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
{
nir_write_query_result(b, nir_iadd(b, dst_addr, dst_offset),
num_reports, flags, nir_b2i64(b, available));
}
nir_pop_if(b, NULL);
}
static nir_shader *
build_copy_queries_shader(void)
{
@@ -870,16 +720,11 @@ build_copy_queries_shader(void)
push_iface_type, "push");
b->shader->info.workgroup_size[0] = 32;
nir_def *wg_id = nir_load_workgroup_id(b);
nir_def *i = nir_iadd(b, nir_load_subgroup_invocation(b),
nir_imul_imm(b, nir_channel(b, wg_id, 0), 32));
nir_def *query_count = load_struct_var(b, push, 4);
nir_push_if(b, nir_ilt(b, i, query_count));
{
nvk_nir_copy_query(b, push, i);
}
nir_pop_if(b, NULL);
nvk_copy_queries(b, load_struct_var(b, push, 0), load_struct_var(b, push, 1),
load_struct_var(b, push, 2), load_struct_var(b, push, 3),
load_struct_var(b, push, 4), load_struct_var(b, push, 5),
load_struct_var(b, push, 6), load_struct_var(b, push, 7));
return build.shader;
}
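For reference, the deleted nir_write_query_result above spells out the store-width selection that the CL version now leaves to libcl's vk_write_query: entry idx lands in an 8-byte slot when VK_QUERY_RESULT_64_BIT is set and in a 4-byte slot otherwise. Below is a plain-C sketch of that behavior, inferred from the removed NIR code rather than from the actual libcl_vk.h implementation.

#include <stdint.h>
#include <string.h>

/* Bit value as defined by Vulkan's vulkan_core.h. */
#define VK_QUERY_RESULT_64_BIT 0x1

/* Hypothetical mirror of the removed nir_write_query_result(): the result
 * for entry 'idx' is stored at idx * 8 as a uint64_t when 64-bit results
 * are requested, otherwise truncated to uint32_t and stored at idx * 4. */
static void
write_query_result(void *dst, uint32_t idx, uint32_t flags, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      memcpy((uint8_t *)dst + idx * 8u, &result, sizeof(uint64_t));
   } else {
      uint32_t result32 = (uint32_t)result;
      memcpy((uint8_t *)dst + idx * 4u, &result32, sizeof(uint32_t));
   }
}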