nak: Add gpr_limit_from_local_size

I stumbled on this limit - it turns out that large local_sizes apply an additional limit on gprs per thread. If we violate this limit, then dmesg just gives us a rather unhelpful message that the channel is killed: nouveau 0000:01:00.0: gsp: rc engn:00000001 chid:64 type:13 scope:1 part:233 nouveau 0000:01:00.0: fifo:c00000:0008:0040:[hw_tests::test_[14761]] errored - disabling channel Cc: mesa-stable Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> (cherry picked from commit b99772e71e) Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33113>
2025-01-08 17:47:02 -05:00
parent d1c63709cf
commit 0e7cf6c7b5
5 changed files with 55 additions and 4 deletions
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -1674,7 +1674,7 @@
        "description": "nak: Add gpr_limit_from_local_size",
        "nominated": true,
        "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
        "main_sha": null,
        "because_sha": null,
        "notes": null
--- a/src/nouveau/compiler/nak/api.rs
+++ b/src/nouveau/compiler/nak/api.rs
@@ -205,7 +205,7 @@ pub extern "C" fn nak_nir_options(

 #[repr(C)]
 pub struct ShaderBin {
-    bin: nak_shader_bin,
+    pub bin: nak_shader_bin,
    code: Vec<u32>,
    asm: CString,
 }
--- a/src/nouveau/compiler/nak/assign_regs.rs
+++ b/src/nouveau/compiler/nak/assign_regs.rs
@@ -7,7 +7,7 @@ use crate::liveness::{BlockLiveness, Liveness, SimpleLiveness};
 use crate::union_find::UnionFind;

 use compiler::bitset::BitSet;
-use std::cmp::{max, Ordering};
+use std::cmp::{max, min, Ordering};
 use std::collections::{HashMap, HashSet};

 struct KillSet {
@@ -1437,13 +1437,22 @@ impl Shader<'_> {
        let mut gpr_limit = max(max_live[RegFile::GPR], 16);
        let mut total_gprs = gpr_limit + u32::from(tmp_gprs);

-        let max_gprs = if DEBUG.spill() {
+        let mut max_gprs = if DEBUG.spill() {
            // We need at least 16 registers to satisfy RA constraints for
            // texture ops and another 2 for parallel copy lowering
            18
        } else {
            self.sm.num_regs(RegFile::GPR)
        };
+
+        if let ShaderStageInfo::Compute(cs_info) = &self.info.stage {
+            max_gprs = min(
+                max_gprs,
+                gpr_limit_from_local_size(&cs_info.local_size)
+                    - self.sm.hw_reserved_gprs(),
+            );
+        }
+
        if total_gprs > max_gprs {
            // If we're spilling GPRs, we need to reserve 2 GPRs for OpParCopy
            // lowering because it needs to be able lower Mem copies which
--- a/src/nouveau/compiler/nak/hw_tests.rs
+++ b/src/nouveau/compiler/nak/hw_tests.rs
@@ -8,6 +8,7 @@ use crate::sm50::ShaderModel50;
 use crate::sm70::ShaderModel70;

 use acorn::Acorn;
+use compiler::bindings::MESA_SHADER_COMPUTE;
 use compiler::cfg::CFGBuilder;
 use nak_bindings::*;
 use std::str::FromStr;
@@ -1188,3 +1189,25 @@ fn test_f2fp_pack_ab() {
    // { 1.455fp16, 0.0fp16 }
    assert_eq!(data[2][3], 0x3dd24000);
 }
+
+#[test] // Hardware test: a shader declaring the computed GPR limit must run for every 1-D local size.
+pub fn test_gpr_limit_from_local_size() {
+    let run = RunSingleton::get();
+    let b = TestShaderBuilder::new(run.sm.as_ref());
+    let mut bin = b.compile(); // compiled once; only `bin.bin.info` is modified per iteration
+
+    for local_size in 1..=1024 {
+        let info = &mut bin.bin.info;
+        let cs_info = unsafe { // NOTE(review): `__bindgen_anon_1` looks like a bindgen union, hence `unsafe`
+            assert_eq!(info.stage, MESA_SHADER_COMPUTE); // check the stage before touching `cs`
+            &mut info.__bindgen_anon_1.cs
+        };
+        cs_info.local_size = [local_size, 1, 1];
+        let num_gprs = gpr_limit_from_local_size(&cs_info.local_size);
+        info.num_gprs = num_gprs.try_into().unwrap(); // declare exactly the limit computed for this size
+
+        run.run.run::<u8>(&bin, &mut [0; 4096]).unwrap_or_else(|_| { // any Err from the run fails the test
+            panic!("Failed with local_size {local_size}, num_gprs {num_gprs}")
+        });
+    }
+}
--- a/src/nouveau/compiler/nak/ir.rs
+++ b/src/nouveau/compiler/nak/ir.rs
@@ -7259,6 +7259,25 @@ pub trait ShaderModel {
    fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32>;
 }

+/// For compute shaders, large values of local_size impose an additional limit
+/// on the number of GPRs per thread. Returns that limit, capped at 255.
+pub fn gpr_limit_from_local_size(local_size: &[u16; 3]) -> u32 {
+    fn prev_multiple_of(x: u32, y: u32) -> u32 {
+        (x / y) * y // integer division truncates, so this rounds x down to a multiple of y
+    }
+
+    let local_size = local_size[0] * local_size[1] * local_size[2]; // NOTE(review): multiplied in u16; verify the product cannot exceed u16::MAX
+    // Warps are allocated in multiples of 4
+    // Multiply that by 32 threads/warp
+    let local_size = local_size.next_multiple_of(4 * 32) as u32;
+    let total_regs: u32 = 65536;
+
+    let out = total_regs / local_size; // NOTE(review): divides by zero if any dimension is 0; confirm callers never pass that
+    // GPRs are allocated in multiples of 8
+    let out = prev_multiple_of(out, 8);
+    min(out, 255)
+}
+
 pub struct Shader<'a> {
    pub sm: &'a dyn ShaderModel,
    pub info: ShaderInfo,