From cd7128c2e305eb7673dcd1c35baf4f3e41d39d35 Mon Sep 17 00:00:00 2001
From: Faith Ekstrand
Date: Thu, 18 Jul 2024 11:20:28 -0500
Subject: [PATCH] nak: Add a bare HW shader runner

Part-of:
---
 src/nouveau/compiler/meson.build       |  31 +-
 src/nouveau/compiler/nak_bindings.h    |  12 +
 src/nouveau/compiler/nak_runner/lib.rs | 422 +++++++++++++++++++++++++
 src/nouveau/meson.build                |   2 +-
 4 files changed, 464 insertions(+), 3 deletions(-)
 create mode 100644 src/nouveau/compiler/nak_runner/lib.rs

diff --git a/src/nouveau/compiler/meson.build b/src/nouveau/compiler/meson.build
index d82b341985e..0d8e67d7255 100644
--- a/src/nouveau/compiler/meson.build
+++ b/src/nouveau/compiler/meson.build
@@ -94,6 +94,7 @@ _nak_bindings_rs = rust.bindgen(
     '--raw-line', '#![allow(non_camel_case_types)]',
     '--raw-line', '#![allow(non_snake_case)]',
     '--raw-line', '#![allow(non_upper_case_globals)]',
+    '--allowlist-type', 'drm.*',
     '--allowlist-type', 'exec_list',
     '--allowlist-type', 'exec_node',
     '--allowlist-type', 'float_controls',
@@ -107,17 +108,26 @@ _nak_bindings_rs = rust.bindgen(
     '--allowlist-type', 'gl_vert_attrib',
     '--allowlist-type', 'nak_.*',
     '--allowlist-type', 'nir_.*',
+    '--allowlist-type', 'nouveau_ws_.*',
     '--allowlist-type', 'mesa_scope',
     '--allowlist-type', 'mesa_prim',
     '--allowlist-type', 'tess_primitive_mode',
+    '--allowlist-var', 'DRM_.*',
     '--allowlist-var', 'nir_.*_infos',
+    '--allowlist-var', 'NVIDIA_VENDOR_ID',
+    '--allowlist-function', 'drm.*',
+    '--allowlist-function', 'glsl_.*',
     '--allowlist-function', '_mesa_shader_stage_to_string',
     '--allowlist-function', 'nak_.*',
     '--allowlist-function', 'nir_.*',
-    '--allowlist-function', 'glsl_.*',
+    '--allowlist-function', 'nouveau_ws_.*',
     '--no-prepend-enum-name',
   ],
-  dependencies : libnak_deps,
+  dependencies : [
+    dep_libdrm,
+    idep_nouveau_ws,
+    libnak_deps,
+  ],
 )
 
 _libnak_bindings_rs = static_library(
@@ -164,6 +174,23 @@ _libnak_rs = static_library(
 )
 
 if with_tests
+  _libnak_runner = static_library(
+    'nak_runner',
+    files('nak_runner/lib.rs'),
+    gnu_symbol_visibility : 'hidden',
+    rust_abi : 'rust',
+    rust_args : nak_rust_args,
+    dependencies : [
+      dep_libdrm,
+      idep_nvidia_headers_rs,
+      idep_nv_push_rs,
+    ],
+    link_with: [
+      _libnak_bindings_rs,
+      _libnak_qmd_rs,
+    ],
+  )
+
   rust.test('nak', _libnak_rs, suite : ['nouveau'])
 endif
 
diff --git a/src/nouveau/compiler/nak_bindings.h b/src/nouveau/compiler/nak_bindings.h
index fa20b29bcb5..9af1aecef3c 100644
--- a/src/nouveau/compiler/nak_bindings.h
+++ b/src/nouveau/compiler/nak_bindings.h
@@ -4,3 +4,15 @@
  */
 
 #include "nak_private.h"
+
+#include "nouveau_bo.h"
+#include "nouveau_context.h"
+#include "nouveau_device.h"
+
+#include <xf86drm.h>
+#include "drm-uapi/nouveau_drm.h"
+
+#define DRM_RS_IOCTL(FOO) \
+   static const unsigned long DRM_RS_IOCTL_##FOO = DRM_IOCTL_##FOO
+
+DRM_RS_IOCTL(NOUVEAU_EXEC);
diff --git a/src/nouveau/compiler/nak_runner/lib.rs b/src/nouveau/compiler/nak_runner/lib.rs
new file mode 100644
index 00000000000..7bc3263bf6d
--- /dev/null
+++ b/src/nouveau/compiler/nak_runner/lib.rs
@@ -0,0 +1,422 @@
+// Copyright © 2022 Collabora, Ltd.
+// SPDX-License-Identifier: MIT
+
+use nak_bindings::*;
+use nv_push_rs::Push as NvPush;
+use nvidia_headers::classes::cla0c0::mthd as cla0c0;
+use nvidia_headers::classes::clb1c0::mthd as clb1c0;
+use nvidia_headers::classes::clb1c0::MAXWELL_COMPUTE_B;
+use nvidia_headers::classes::clc3c0::mthd as clc3c0;
+use nvidia_headers::classes::clc3c0::VOLTA_COMPUTE_A;
+use nvidia_headers::classes::clc6c0::mthd as clc6c0;
+use nvidia_headers::classes::clc6c0::AMPERE_COMPUTE_A;
+
+use std::io;
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::Mutex;
+
+unsafe fn is_nvidia_device(dev: drmDevicePtr) -> bool {
+    match (*dev).bustype as u32 {
+        DRM_BUS_PCI => {
+            let pci = &*(*dev).deviceinfo.pci;
+            pci.vendor_id == (NVIDIA_VENDOR_ID as u16)
+        }
+        _ => false,
+    }
+}
+
+#[repr(C)]
+pub struct CB0 {
+    pub data_addr_lo: u32,
+    pub data_addr_hi: u32,
+    pub data_stride: u32,
+    pub invocations: u32,
+}
+
+struct BO<'a> {
+    run: &'a Runner,
+    bo: NonNull<nouveau_ws_bo>,
+    pub addr: u64,
+    pub map: *mut std::os::raw::c_void,
+}
+
+impl<'a> BO<'a> {
+    fn new(run: &'a Runner, size: u64) -> io::Result<BO<'a>> {
+        let size = size.next_multiple_of(4096);
+
+        let mut map: *mut std::os::raw::c_void = std::ptr::null_mut();
+        let bo = unsafe {
+            nouveau_ws_bo_new_mapped(
+                run.dev.as_ptr(),
+                size,
+                0, // align
+                NOUVEAU_WS_BO_GART,
+                NOUVEAU_WS_BO_RDWR,
+                &mut map as *mut _,
+            )
+        };
+        let Some(bo) = NonNull::new(bo) else {
+            return Err(io::Error::last_os_error());
+        };
+        assert!(!map.is_null());
+
+        let addr = run.next_addr.fetch_add(size, Ordering::Relaxed);
+        assert!(addr % 4096 == 0);
+
+        unsafe {
+            nouveau_ws_bo_bind_vma(
+                run.dev.as_ptr(),
+                bo.as_ptr(),
+                addr,
+                size,
+                0, // bo_offset
+                0, // pte_kind
+            );
+        }
+
+        Ok(BO { run, bo, addr, map })
+    }
+}
+
+impl Drop for BO<'_> {
+    fn drop(&mut self) {
+        unsafe {
+            nouveau_ws_bo_unbind_vma(
+                self.run.dev.as_ptr(),
+                self.addr,
+                self.bo.as_ref().size,
+            );
+            nouveau_ws_bo_destroy(self.bo.as_ptr());
+        }
+    }
+}
+
+pub struct Runner {
+    dev: NonNull<nouveau_ws_device>,
+    ctx: NonNull<nouveau_ws_context>,
+    syncobj: u32,
+    sync_value: Mutex<u64>,
+    next_addr: AtomicU64,
+}
+
+impl<'a> Runner {
+    pub fn new(dev_id: Option<usize>) -> Runner {
+        unsafe {
+            let mut drm_devices: [drmDevicePtr; 16] = std::mem::zeroed();
+            let num_drm_devices = drmGetDevices(
+                drm_devices.as_mut_ptr(),
+                drm_devices.len().try_into().unwrap(),
+            );
+
+            assert!(num_drm_devices >= 0, "Failed to enumerate DRM devices");
+            let num_drm_devices: usize = num_drm_devices.try_into().unwrap();
+
+            let drm_dev = if let Some(dev_id) = dev_id {
+                assert!(dev_id < num_drm_devices, "Unknown device {dev_id}");
+                assert!(
+                    is_nvidia_device(drm_devices[dev_id]),
+                    "Device {dev_id} is not an NVIDIA device",
+                );
+                drm_devices[dev_id]
+            } else {
+                *drm_devices
+                    .iter()
+                    .find(|dev| is_nvidia_device(**dev))
+                    .expect("Failed to find an NVIDIA device")
+            };
+
+            let dev = nouveau_ws_device_new(drm_dev);
+            let dev =
+                NonNull::new(dev).expect("Failed to create nouveau device");
+
+            drmFreeDevices(
+                drm_devices.as_mut_ptr(),
+                num_drm_devices.try_into().unwrap(),
+            );
+
+            let mut ctx: *mut nouveau_ws_context = std::ptr::null_mut();
+            let err = nouveau_ws_context_create(
+                dev.as_ptr(),
+                NOUVEAU_WS_ENGINE_COMPUTE,
+                &mut ctx,
+            );
+            assert!(err == 0, "Failed to create nouveau context");
+            let ctx = NonNull::new(ctx).unwrap();
+
+            let mut syncobj = 0_u32;
+            let err = drmSyncobjCreate(dev.as_ref().fd, 0, &mut syncobj);
+            assert!(err == 0, "Failed to create syncobj");
+
+            Runner {
+                dev,
+                ctx,
+                syncobj,
+                sync_value: Mutex::new(0),
+                next_addr: AtomicU64::new(1 << 16),
+            }
+        }
+    }
+
+    pub fn dev_info(&self) -> &nv_device_info {
+        unsafe { &self.dev.as_ref().info }
+    }
+
+    fn exec(&self, addr: u64, len: u16) -> io::Result<()> {
+        let sync_value = unsafe {
+            let mut sync_value = self.sync_value.lock().unwrap();
+            *sync_value += 1;
+
+            let push = drm_nouveau_exec_push {
+                va: addr,
+                va_len: len.into(),
+                flags: 0,
+            };
+            let sig = drm_nouveau_sync {
+                flags: DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ,
+                handle: self.syncobj,
+                timeline_value: *sync_value,
+            };
+            let exec = drm_nouveau_exec {
+                channel: self.ctx.as_ref().channel as u32,
+                wait_count: 0,
+                wait_ptr: 0,
+                push_count: 1,
+                push_ptr: &push as *const _ as u64,
+                sig_count: 1,
+                sig_ptr: &sig as *const _ as u64,
+            };
+            let err = drmIoctl(
+                self.dev.as_ref().fd,
+                DRM_RS_IOCTL_NOUVEAU_EXEC,
+                &exec as *const _ as *mut std::os::raw::c_void,
+            );
+            if err != 0 {
+                return Err(io::Error::last_os_error());
+            }
+            *sync_value
+        };
+        // The close of this unsafe { } drops the lock
+
+        unsafe {
+            let err = drmSyncobjTimelineWait(
+                self.dev.as_ref().fd,
+                &self.syncobj as *const _ as *mut _,
+                &sync_value as *const _ as *mut _,
+                1,        // num_handles
+                i64::MAX, // timeout_nsec
+                DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
+                std::ptr::null_mut(),
+            );
+            if err != 0 {
+                return Err(io::Error::last_os_error());
+            }
+
+            // Exec again to check for errors
+            let exec = drm_nouveau_exec {
+                channel: self.ctx.as_ref().channel as u32,
+                wait_count: 0,
+                wait_ptr: 0,
+                push_count: 0,
+                push_ptr: 0,
+                sig_count: 0,
+                sig_ptr: 0,
+            };
+            let err = drmIoctl(
+                self.dev.as_ref().fd,
+                DRM_RS_IOCTL_NOUVEAU_EXEC,
+                &exec as *const _ as *mut std::os::raw::c_void,
+            );
+            if err != 0 {
+                return Err(io::Error::last_os_error());
+            }
+        }
+
+        Ok(())
+    }
+
+    pub unsafe fn run_raw(
+        &self,
+        shader: &nak_shader_bin,
+        invocations: u32,
+        data_stride: u32,
+        data: *mut std::os::raw::c_void,
+        data_size: usize,
+    ) -> io::Result<()> {
+        assert!(shader.info.stage == MESA_SHADER_COMPUTE);
+        let cs_info = &shader.info.__bindgen_anon_1.cs;
+        assert!(cs_info.local_size[1] == 1 && cs_info.local_size[2] == 1);
+        let local_size = cs_info.local_size[0];
+
+        // Compute the needed size of the buffer
+        let mut size = 0_usize;
+
+        const MAX_PUSH_DW: usize = 256;
+        let push_offset = size;
+        size = push_offset + 4 * MAX_PUSH_DW;
+
+        const QMD_SIZE: usize = 64 * 4;
+        let qmd_offset = size.next_multiple_of(0x100);
+        size = qmd_offset + 4 * QMD_SIZE;
+
+        let shader_offset = size.next_multiple_of(0x80);
+        size = shader_offset + usize::try_from(shader.code_size).unwrap();
+
+        let cb0_offset = size.next_multiple_of(256);
+        size = cb0_offset + std::mem::size_of::<CB0>();
+
+        let data_offset = size.next_multiple_of(256);
+        size = data_offset + data_size;
+
+        let bo = BO::new(self, size.try_into().unwrap())?;
+
+        // Copy the data from the caller into our BO
+        let data_addr = bo.addr + u64::try_from(data_offset).unwrap();
+        let data_map = bo.map.offset(data_offset.try_into().unwrap());
+        std::ptr::copy(data, data_map, data_size);
+
+        // Fill out cb0
+        let cb0_addr = bo.addr + u64::try_from(cb0_offset).unwrap();
+        let cb0_map = bo.map.offset(cb0_offset.try_into().unwrap());
+        (cb0_map as *mut CB0).write(CB0 {
+            data_addr_lo: data_addr as u32,
+            data_addr_hi: (data_addr >> 32) as u32,
+            data_stride,
+            invocations,
+        });
+
+        // Upload the shader
+        let shader_addr = bo.addr + u64::try_from(shader_offset).unwrap();
+        let shader_map = bo.map.offset(shader_offset.try_into().unwrap());
+        std::ptr::copy(
+            shader.code,
+            shader_map,
+            shader.code_size.try_into().unwrap(),
+        );
+
+        // Populate and upload the QMD
+        let mut qmd_cbufs: [nak_qmd_cbuf; 8] = unsafe { std::mem::zeroed() };
+        qmd_cbufs[0] = nak_qmd_cbuf {
+            index: 0,
+            size: std::mem::size_of::<CB0>()
+                .next_multiple_of(256)
+                .try_into()
+                .unwrap(),
+            addr: cb0_addr,
+        };
+        let qmd_info = nak_qmd_info {
+            // Pre-Volta, we set the program region to the start of the bo
+            addr: if self.dev_info().cls_compute < VOLTA_COMPUTE_A {
+                shader_offset.try_into().unwrap()
+            } else {
+                shader_addr
+            },
+            smem_size: 0,
+            smem_max: 48 * 1024,
+            global_size: [invocations.div_ceil(local_size.into()), 1, 1],
+            num_cbufs: 1,
+            cbufs: qmd_cbufs,
+        };
+
+        let qmd_addr = bo.addr + u64::try_from(qmd_offset).unwrap();
+        let qmd_map = bo.map.offset(qmd_offset.try_into().unwrap());
+        nak_fill_qmd(
+            self.dev_info(),
+            &shader.info,
+            &qmd_info,
+            qmd_map,
+            QMD_SIZE,
+        );
+
+        // Fill out the pushbuf
+        let mut p = NvPush::new();
+
+        p.push_method(cla0c0::SetObject {
+            class_id: self.dev_info().cls_compute.into(),
+            engine_id: 0,
+        });
+        if self.dev_info().cls_compute < VOLTA_COMPUTE_A {
+            p.push_method(cla0c0::SetProgramRegionA {
+                address_upper: (bo.addr >> 32) as u32,
+            });
+            p.push_method(cla0c0::SetProgramRegionB {
+                address_lower: bo.addr as u32,
+            });
+        }
+
+        let smem_base_addr = 0xfe000000_u32;
+        let lmem_base_addr = 0xff000000_u32;
+        if self.dev_info().cls_compute >= VOLTA_COMPUTE_A {
+            p.push_method(clc3c0::SetShaderSharedMemoryWindowA {
+                base_address_upper: 0,
+            });
+            p.push_method(clc3c0::SetShaderSharedMemoryWindowB {
+                base_address: smem_base_addr,
+            });
+
+            p.push_method(clc3c0::SetShaderLocalMemoryWindowA {
+                base_address_upper: 0,
+            });
+            p.push_method(clc3c0::SetShaderLocalMemoryWindowB {
+                base_address: lmem_base_addr,
+            });
+        } else {
+            p.push_method(cla0c0::SetShaderSharedMemoryWindow {
+                base_address: smem_base_addr,
+            });
+            p.push_method(cla0c0::SetShaderLocalMemoryWindow {
+                base_address: lmem_base_addr,
+            });
+        }
+
+        if self.dev_info().cls_compute >= MAXWELL_COMPUTE_B {
+            p.push_method(clb1c0::InvalidateSkedCaches { v: 0 });
+        }
+
+        p.push_method(cla0c0::SendPcasA {
+            qmd_address_shifted8: (qmd_addr >> 8) as u32,
+        });
+        if self.dev_info().cls_compute >= AMPERE_COMPUTE_A {
+            p.push_method(clc6c0::SendSignalingPcas2B {
+                pcas_action: clc6c0::SendSignalingPcas2BPcasAction::InvalidateCopySchedule,
+            });
+        } else {
+            p.push_method(cla0c0::SendSignalingPcasB {
+                invalidate: true,
+                schedule: true,
+            });
+        }
+
+        let push_addr = bo.addr + u64::try_from(push_offset).unwrap();
+        let push_map = bo.map.offset(push_offset.try_into().unwrap());
+        std::ptr::copy(p.as_ptr(), push_map as *mut u32, p.len());
+
+        let res = self.exec(push_addr, (p.len() * 4).try_into().unwrap());
+
+        // Always copy the data back to the caller, even if exec fails
+        let data_map = bo.map.offset(data_offset.try_into().unwrap());
+        std::ptr::copy(data_map, data, data_size);
+
+        res
+    }
+
+    pub fn run<T>(
+        &self,
+        shader: &nak_shader_bin,
+        data: &mut [T],
+    ) -> io::Result<()> {
+        unsafe {
+            let stride = std::mem::size_of::<T>();
+            self.run_raw(
+                shader,
+                data.len().try_into().unwrap(),
+                stride.try_into().unwrap(),
+                data.as_mut_ptr() as *mut std::os::raw::c_void,
+                data.len() * stride,
+            )
+        }
+    }
+}
+
+unsafe impl Sync for Runner {}
+unsafe impl Send for Runner {}
diff --git a/src/nouveau/meson.build b/src/nouveau/meson.build
index 3aad1053e47..5cbdb2fcde3 100644
--- a/src/nouveau/meson.build
+++ b/src/nouveau/meson.build
@@ -3,10 +3,10 @@
 
 subdir('drm')
 subdir('headers')
+subdir('winsys')
 if with_nouveau_vk
   subdir('compiler')
 endif
-subdir('winsys')
 if with_tools.contains('drm-shim')
   subdir('drm-shim')
 endif
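
Illustrative usage sketch (not part of the patch): roughly how a test might drive the new runner, assuming the test crate links the nak_runner library added above. The compile_compute_shader() helper below is hypothetical and stands in for producing a compute nak_shader_bin (for example, built from NIR by the NAK compiler); everything else uses only the Runner API from lib.rs.

// Sketch only; compile_compute_shader() is a hypothetical helper that
// returns a compute nak_shader_bin suitable for this device.
use nak_runner::Runner;

fn run_example() -> std::io::Result<()> {
    // Open the first NVIDIA device; pass Some(index) to pick a specific one.
    let run = Runner::new(None);

    // One u32 slot per invocation; Runner::run() derives the invocation count
    // and per-invocation stride from the slice and copies results back out,
    // even if the exec itself fails.
    let mut data = vec![0u32; 1024];

    let bin = compile_compute_shader(run.dev_info());
    run.run(&bin, &mut data)?;

    // data now holds whatever the shader wrote through cb0's data pointer.
    Ok(())
}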