From cd7128c2e305eb7673dcd1c35baf4f3e41d39d35 Mon Sep 17 00:00:00 2001
From: Faith Ekstrand
Date: Thu, 18 Jul 2024 11:20:28 -0500
Subject: [PATCH] nak: Add a bare HW shader runner

Part-of:
---
 src/nouveau/compiler/meson.build       |  31 +-
 src/nouveau/compiler/nak_bindings.h    |  12 +
 src/nouveau/compiler/nak_runner/lib.rs | 422 +++++++++++++++++++++++++
 src/nouveau/meson.build                |   2 +-
 4 files changed, 464 insertions(+), 3 deletions(-)
 create mode 100644 src/nouveau/compiler/nak_runner/lib.rs

diff --git a/src/nouveau/compiler/meson.build b/src/nouveau/compiler/meson.build
index d82b341985e..0d8e67d7255 100644
--- a/src/nouveau/compiler/meson.build
+++ b/src/nouveau/compiler/meson.build
@@ -94,6 +94,7 @@ _nak_bindings_rs = rust.bindgen(
     '--raw-line', '#![allow(non_camel_case_types)]',
     '--raw-line', '#![allow(non_snake_case)]',
     '--raw-line', '#![allow(non_upper_case_globals)]',
+    '--allowlist-type', 'drm.*',
     '--allowlist-type', 'exec_list',
     '--allowlist-type', 'exec_node',
     '--allowlist-type', 'float_controls',
@@ -107,17 +108,26 @@ _nak_bindings_rs = rust.bindgen(
     '--allowlist-type', 'gl_vert_attrib',
     '--allowlist-type', 'nak_.*',
     '--allowlist-type', 'nir_.*',
+    '--allowlist-type', 'nouveau_ws_.*',
     '--allowlist-type', 'mesa_scope',
     '--allowlist-type', 'mesa_prim',
     '--allowlist-type', 'tess_primitive_mode',
+    '--allowlist-var', 'DRM_.*',
     '--allowlist-var', 'nir_.*_infos',
+    '--allowlist-var', 'NVIDIA_VENDOR_ID',
+    '--allowlist-function', 'drm.*',
+    '--allowlist-function', 'glsl_.*',
     '--allowlist-function', '_mesa_shader_stage_to_string',
     '--allowlist-function', 'nak_.*',
     '--allowlist-function', 'nir_.*',
-    '--allowlist-function', 'glsl_.*',
+    '--allowlist-function', 'nouveau_ws_.*',
     '--no-prepend-enum-name',
   ],
-  dependencies : libnak_deps,
+  dependencies : [
+    dep_libdrm,
+    idep_nouveau_ws,
+    libnak_deps,
+  ],
 )
 
 _libnak_bindings_rs = static_library(
@@ -164,6 +174,23 @@ _libnak_rs = static_library(
 )
 
 if with_tests
+  _libnak_runner = static_library(
+    'nak_runner',
+    files('nak_runner/lib.rs'),
+    gnu_symbol_visibility : 'hidden',
+    rust_abi : 'rust',
+    rust_args : nak_rust_args,
+    dependencies : [
+      dep_libdrm,
+      idep_nvidia_headers_rs,
+      idep_nv_push_rs,
+    ],
+    link_with: [
+      _libnak_bindings_rs,
+      _libnak_qmd_rs,
+    ],
+  )
+
   rust.test('nak', _libnak_rs, suite : ['nouveau'])
 endif
 
diff --git a/src/nouveau/compiler/nak_bindings.h b/src/nouveau/compiler/nak_bindings.h
index fa20b29bcb5..9af1aecef3c 100644
--- a/src/nouveau/compiler/nak_bindings.h
+++ b/src/nouveau/compiler/nak_bindings.h
@@ -4,3 +4,15 @@
  */
 
 #include "nak_private.h"
+
+#include "nouveau_bo.h"
+#include "nouveau_context.h"
+#include "nouveau_device.h"
+
+#include <xf86drm.h>
+#include "drm-uapi/nouveau_drm.h"
+
+#define DRM_RS_IOCTL(FOO) \
+   static const unsigned long DRM_RS_IOCTL_##FOO = DRM_IOCTL_##FOO
+
+DRM_RS_IOCTL(NOUVEAU_EXEC);
diff --git a/src/nouveau/compiler/nak_runner/lib.rs b/src/nouveau/compiler/nak_runner/lib.rs
new file mode 100644
index 00000000000..7bc3263bf6d
--- /dev/null
+++ b/src/nouveau/compiler/nak_runner/lib.rs
@@ -0,0 +1,422 @@
+// Copyright © 2022 Collabora, Ltd.
+// SPDX-License-Identifier: MIT
+
+use nak_bindings::*;
+use nv_push_rs::Push as NvPush;
+use nvidia_headers::classes::cla0c0::mthd as cla0c0;
+use nvidia_headers::classes::clb1c0::mthd as clb1c0;
+use nvidia_headers::classes::clb1c0::MAXWELL_COMPUTE_B;
+use nvidia_headers::classes::clc3c0::mthd as clc3c0;
+use nvidia_headers::classes::clc3c0::VOLTA_COMPUTE_A;
+use nvidia_headers::classes::clc6c0::mthd as clc6c0;
+use nvidia_headers::classes::clc6c0::AMPERE_COMPUTE_A;
+
+use std::io;
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::Mutex;
+
+unsafe fn is_nvidia_device(dev: drmDevicePtr) -> bool {
+    match (*dev).bustype as u32 {
+        DRM_BUS_PCI => {
+            let pci = &*(*dev).deviceinfo.pci;
+            pci.vendor_id == (NVIDIA_VENDOR_ID as u16)
+        }
+        _ => false,
+    }
+}
+
+#[repr(C)]
+pub struct CB0 {
+    pub data_addr_lo: u32,
+    pub data_addr_hi: u32,
+    pub data_stride: u32,
+    pub invocations: u32,
+}
+
+struct BO<'a> {
+    run: &'a Runner,
+    bo: NonNull<nouveau_ws_bo>,
+    pub addr: u64,
+    pub map: *mut std::os::raw::c_void,
+}
+
+impl<'a> BO<'a> {
+    fn new(run: &'a Runner, size: u64) -> io::Result<BO<'a>> {
+        let size = size.next_multiple_of(4096);
+
+        let mut map: *mut std::os::raw::c_void = std::ptr::null_mut();
+        let bo = unsafe {
+            nouveau_ws_bo_new_mapped(
+                run.dev.as_ptr(),
+                size,
+                0, // align
+                NOUVEAU_WS_BO_GART,
+                NOUVEAU_WS_BO_RDWR,
+                &mut map as *mut _,
+            )
+        };
+        let Some(bo) = NonNull::new(bo) else {
+            return Err(io::Error::last_os_error());
+        };
+        assert!(!map.is_null());
+
+        let addr = run.next_addr.fetch_add(size, Ordering::Relaxed);
+        assert!(addr % 4096 == 0);
+
+        unsafe {
+            nouveau_ws_bo_bind_vma(
+                run.dev.as_ptr(),
+                bo.as_ptr(),
+                addr,
+                size,
+                0, // bo_offset
+                0, // pte_kind
+            );
+        }
+
+        Ok(BO { run, bo, addr, map })
+    }
+}
+
+impl Drop for BO<'_> {
+    fn drop(&mut self) {
+        unsafe {
+            nouveau_ws_bo_unbind_vma(
+                self.run.dev.as_ptr(),
+                self.addr,
+                self.bo.as_ref().size,
+            );
+            nouveau_ws_bo_destroy(self.bo.as_ptr());
+        }
+    }
+}
+
+pub struct Runner {
+    dev: NonNull<nouveau_ws_device>,
+    ctx: NonNull<nouveau_ws_context>,
+    syncobj: u32,
+    sync_value: Mutex<u64>,
+    next_addr: AtomicU64,
+}
+
+impl<'a> Runner {
+    pub fn new(dev_id: Option<usize>) -> Runner {
+        unsafe {
+            let mut drm_devices: [drmDevicePtr; 16] = std::mem::zeroed();
+            let num_drm_devices = drmGetDevices(
+                drm_devices.as_mut_ptr(),
+                drm_devices.len().try_into().unwrap(),
+            );
+
+            assert!(num_drm_devices >= 0, "Failed to enumerate DRM devices");
+            let num_drm_devices: usize = num_drm_devices.try_into().unwrap();
+
+            let drm_dev = if let Some(dev_id) = dev_id {
+                assert!(dev_id < num_drm_devices, "Unknown device {dev_id}");
+                assert!(
+                    is_nvidia_device(drm_devices[dev_id]),
+                    "Device {dev_id} is not an NVIDIA device",
+                );
+                drm_devices[dev_id]
+            } else {
+                *drm_devices
+                    .iter()
+                    .find(|dev| is_nvidia_device(**dev))
+                    .expect("Failed to find an NVIDIA device")
+            };
+
+            let dev = nouveau_ws_device_new(drm_dev);
+            let dev =
+                NonNull::new(dev).expect("Failed to create nouveau device");
+
+            drmFreeDevices(
+                drm_devices.as_mut_ptr(),
+                num_drm_devices.try_into().unwrap(),
+            );
+
+            let mut ctx: *mut nouveau_ws_context = std::ptr::null_mut();
+            let err = nouveau_ws_context_create(
+                dev.as_ptr(),
+                NOUVEAU_WS_ENGINE_COMPUTE,
+                &mut ctx,
+            );
+            assert!(err == 0, "Failed to create nouveau context");
+            let ctx = NonNull::new(ctx).unwrap();
+
+            let mut syncobj = 0_u32;
+            let err = drmSyncobjCreate(dev.as_ref().fd, 0, &mut syncobj);
+            assert!(err == 0, "Failed to create syncobj");
+
+            Runner {
+                dev,
+                ctx,
+                syncobj,
+                sync_value: Mutex::new(0),
+                next_addr: AtomicU64::new(1 << 16),
+            }
+        }
+    }
+
+    pub fn dev_info(&self) -> &nv_device_info {
+        unsafe { &self.dev.as_ref().info }
+    }
+
+    fn exec(&self, addr: u64, len: u16) -> io::Result<()> {
+        let sync_value = unsafe {
+            let mut sync_value = self.sync_value.lock().unwrap();
+            *sync_value += 1;
+
+            let push = drm_nouveau_exec_push {
+                va: addr,
+                va_len: len.into(),
+                flags: 0,
+            };
+            let sig = drm_nouveau_sync {
+                flags: DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ,
+                handle: self.syncobj,
+                timeline_value: *sync_value,
+            };
+            let exec = drm_nouveau_exec {
+                channel: self.ctx.as_ref().channel as u32,
+                wait_count: 0,
+                wait_ptr: 0,
+                push_count: 1,
+                push_ptr: &push as *const _ as u64,
+                sig_count: 1,
+                sig_ptr: &sig as *const _ as u64,
+            };
+            let err = drmIoctl(
+                self.dev.as_ref().fd,
+                DRM_RS_IOCTL_NOUVEAU_EXEC,
+                &exec as *const _ as *mut std::os::raw::c_void,
+            );
+            if err != 0 {
+                return Err(io::Error::last_os_error());
+            }
+            *sync_value
+        };
+        // The close of this unsafe { } drops the lock
+
+        unsafe {
+            let err = drmSyncobjTimelineWait(
+                self.dev.as_ref().fd,
+                &self.syncobj as *const _ as *mut _,
+                &sync_value as *const _ as *mut _,
+                1,        // num_handles
+                i64::MAX, // timeout_nsec
+                DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
+                std::ptr::null_mut(),
+            );
+            if err != 0 {
+                return Err(io::Error::last_os_error());
+            }
+
+            // Exec again to check for errors
+            let exec = drm_nouveau_exec {
+                channel: self.ctx.as_ref().channel as u32,
+                wait_count: 0,
+                wait_ptr: 0,
+                push_count: 0,
+                push_ptr: 0,
+                sig_count: 0,
+                sig_ptr: 0,
+            };
+            let err = drmIoctl(
+                self.dev.as_ref().fd,
+                DRM_RS_IOCTL_NOUVEAU_EXEC,
+                &exec as *const _ as *mut std::os::raw::c_void,
+            );
+            if err != 0 {
+                return Err(io::Error::last_os_error());
+            }
+        }
+
+        Ok(())
+    }
+
+    pub unsafe fn run_raw(
+        &self,
+        shader: &nak_shader_bin,
+        invocations: u32,
+        data_stride: u32,
+        data: *mut std::os::raw::c_void,
+        data_size: usize,
+    ) -> io::Result<()> {
+        assert!(shader.info.stage == MESA_SHADER_COMPUTE);
+        let cs_info = &shader.info.__bindgen_anon_1.cs;
+        assert!(cs_info.local_size[1] == 1 && cs_info.local_size[2] == 1);
+        let local_size = cs_info.local_size[0];
+
+        // Compute the needed size of the buffer
+        let mut size = 0_usize;
+
+        const MAX_PUSH_DW: usize = 256;
+        let push_offset = size;
+        size = push_offset + 4 * MAX_PUSH_DW;
+
+        const QMD_SIZE: usize = 64 * 4;
+        let qmd_offset = size.next_multiple_of(0x100);
+        size = qmd_offset + 4 * QMD_SIZE;
+
+        let shader_offset = size.next_multiple_of(0x80);
+        size = shader_offset + usize::try_from(shader.code_size).unwrap();
+
+        let cb0_offset = size.next_multiple_of(256);
+        size = cb0_offset + std::mem::size_of::<CB0>();
+
+        let data_offset = size.next_multiple_of(256);
+        size = data_offset + data_size;
+
+        let bo = BO::new(self, size.try_into().unwrap())?;
+
+        // Copy the data from the caller into our BO
+        let data_addr = bo.addr + u64::try_from(data_offset).unwrap();
+        let data_map = bo.map.offset(data_offset.try_into().unwrap());
+        std::ptr::copy(data, data_map, data_size);
+
+        // Fill out cb0
+        let cb0_addr = bo.addr + u64::try_from(cb0_offset).unwrap();
+        let cb0_map = bo.map.offset(cb0_offset.try_into().unwrap());
+        (cb0_map as *mut CB0).write(CB0 {
+            data_addr_lo: data_addr as u32,
+            data_addr_hi: (data_addr >> 32) as u32,
+            data_stride,
+            invocations,
+        });
+
+        // Upload the shader
+        let shader_addr = bo.addr + u64::try_from(shader_offset).unwrap();
+        let shader_map = bo.map.offset(shader_offset.try_into().unwrap());
+        std::ptr::copy(
+            shader.code,
+            shader_map,
+            shader.code_size.try_into().unwrap(),
+        );
+
+        // Populate and upload the QMD
+        let mut qmd_cbufs: [nak_qmd_cbuf; 8] = unsafe { std::mem::zeroed() };
+        qmd_cbufs[0] = nak_qmd_cbuf {
+            index: 0,
+            size: std::mem::size_of::<CB0>()
+                .next_multiple_of(256)
+                .try_into()
+                .unwrap(),
+            addr: cb0_addr,
+        };
+        let qmd_info = nak_qmd_info {
+            // Pre-Volta, we set the program region to the start of the bo
+            addr: if self.dev_info().cls_compute < VOLTA_COMPUTE_A {
+                shader_offset.try_into().unwrap()
+            } else {
+                shader_addr
+            },
+            smem_size: 0,
+            smem_max: 48 * 1024,
+            global_size: [invocations.div_ceil(local_size.into()), 1, 1],
+            num_cbufs: 1,
+            cbufs: qmd_cbufs,
+        };
+
+        let qmd_addr = bo.addr + u64::try_from(qmd_offset).unwrap();
+        let qmd_map = bo.map.offset(qmd_offset.try_into().unwrap());
+        nak_fill_qmd(
+            self.dev_info(),
+            &shader.info,
+            &qmd_info,
+            qmd_map,
+            QMD_SIZE,
+        );
+
+        // Fill out the pushbuf
+        let mut p = NvPush::new();
+
+        p.push_method(cla0c0::SetObject {
+            class_id: self.dev_info().cls_compute.into(),
+            engine_id: 0,
+        });
+        if self.dev_info().cls_compute < VOLTA_COMPUTE_A {
+            p.push_method(cla0c0::SetProgramRegionA {
+                address_upper: (bo.addr >> 32) as u32,
+            });
+            p.push_method(cla0c0::SetProgramRegionB {
+                address_lower: bo.addr as u32,
+            });
+        }
+
+        let smem_base_addr = 0xfe000000_u32;
+        let lmem_base_addr = 0xff000000_u32;
+        if self.dev_info().cls_compute >= VOLTA_COMPUTE_A {
+            p.push_method(clc3c0::SetShaderSharedMemoryWindowA {
+                base_address_upper: 0,
+            });
+            p.push_method(clc3c0::SetShaderSharedMemoryWindowB {
+                base_address: smem_base_addr,
+            });
+
+            p.push_method(clc3c0::SetShaderLocalMemoryWindowA {
+                base_address_upper: 0,
+            });
+            p.push_method(clc3c0::SetShaderLocalMemoryWindowB {
+                base_address: lmem_base_addr,
+            });
+        } else {
+            p.push_method(cla0c0::SetShaderSharedMemoryWindow {
+                base_address: smem_base_addr,
+            });
+            p.push_method(cla0c0::SetShaderLocalMemoryWindow {
+                base_address: lmem_base_addr,
+            });
+        }
+
+        if self.dev_info().cls_compute >= MAXWELL_COMPUTE_B {
+            p.push_method(clb1c0::InvalidateSkedCaches { v: 0 });
+        }
+
+        p.push_method(cla0c0::SendPcasA {
+            qmd_address_shifted8: (qmd_addr >> 8) as u32,
+        });
+        if self.dev_info().cls_compute >= AMPERE_COMPUTE_A {
+            p.push_method(clc6c0::SendSignalingPcas2B {
+                pcas_action: clc6c0::SendSignalingPcas2BPcasAction::InvalidateCopySchedule,
+            });
+        } else {
+            p.push_method(cla0c0::SendSignalingPcasB {
+                invalidate: true,
+                schedule: true,
+            });
+        }
+
+        let push_addr = bo.addr + u64::try_from(push_offset).unwrap();
+        let push_map = bo.map.offset(push_offset.try_into().unwrap());
+        std::ptr::copy(p.as_ptr(), push_map as *mut u32, p.len());
+
+        let res = self.exec(push_addr, (p.len() * 4).try_into().unwrap());
+
+        // Always copy the data back to the caller, even if exec fails
+        let data_map = bo.map.offset(data_offset.try_into().unwrap());
+        std::ptr::copy(data_map, data, data_size);
+
+        res
+    }
+
+    pub fn run<T>(
+        &self,
+        shader: &nak_shader_bin,
+        data: &mut [T],
+    ) -> io::Result<()> {
+        unsafe {
+            let stride = std::mem::size_of::<T>();
+            self.run_raw(
+                shader,
+                data.len().try_into().unwrap(),
+                stride.try_into().unwrap(),
+                data.as_mut_ptr() as *mut std::os::raw::c_void,
+                data.len() * stride,
+            )
+        }
+    }
+}
+
+unsafe impl Sync for Runner {}
+unsafe impl Send for Runner {}
diff --git a/src/nouveau/meson.build b/src/nouveau/meson.build
index 3aad1053e47..5cbdb2fcde3 100644
--- a/src/nouveau/meson.build
+++ b/src/nouveau/meson.build
@@ -3,10 +3,10 @@
 
 subdir('drm')
 subdir('headers')
+subdir('winsys')
 if with_nouveau_vk
   subdir('compiler')
 endif
-subdir('winsys')
 if with_tools.contains('drm-shim')
   subdir('drm-shim')
 endif
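
Illustrative usage sketch (not part of the patch): roughly how a test might drive the new runner, assuming the test crate links the nak_runner library added above. The compile_compute_shader() helper below is hypothetical and stands in for producing a compute nak_shader_bin (for example, built from NIR by the NAK compiler); everything else uses only the Runner API from lib.rs.

// Sketch only; compile_compute_shader() is a hypothetical helper that
// returns a compute nak_shader_bin suitable for this device.
use nak_runner::Runner;

fn run_example() -> std::io::Result<()> {
    // Open the first NVIDIA device; pass Some(index) to pick a specific one.
    let run = Runner::new(None);

    // One u32 slot per invocation; Runner::run() derives the invocation count
    // and per-invocation stride from the slice and copies results back out,
    // even if the exec itself fails.
    let mut data = vec![0u32; 1024];

    let bin = compile_compute_shader(run.dev_info());
    run.run(&bin, &mut data)?;

    // data now holds whatever the shader wrote through cb0's data pointer.
    Ok(())
}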