nak: Add gpr_limit_from_local_size

I stumbled on this limit - it turns out that large local_sizes apply an
additional limit on gprs per thread. If we violate this limit, then dmesg
just gives us a rather unhelpful message that the channel is killed:

    nouveau 0000:01:00.0: gsp: rc engn:00000001 chid:64 type:13 scope:1 part:233
    nouveau 0000:01:00.0: fifo:c00000:0008:0040:[hw_tests::test_[14761]] errored - disabling channel

Cc: mesa-stable
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
(cherry picked from commit b99772e71e)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33113>
This commit is contained in:
Mel Henning
2025-01-08 17:47:02 -05:00
committed by Dylan Baker
parent d1c63709cf
commit 0e7cf6c7b5
5 changed files with 55 additions and 4 deletions

View File

@@ -1674,7 +1674,7 @@
"description": "nak: Add gpr_limit_from_local_size",
"nominated": true,
"nomination_type": 1,
"resolution": 0,
"resolution": 1,
"main_sha": null,
"because_sha": null,
"notes": null

View File

@@ -205,7 +205,7 @@ pub extern "C" fn nak_nir_options(
#[repr(C)]
pub struct ShaderBin {
bin: nak_shader_bin,
pub bin: nak_shader_bin,
code: Vec<u32>,
asm: CString,
}

View File

@@ -7,7 +7,7 @@ use crate::liveness::{BlockLiveness, Liveness, SimpleLiveness};
use crate::union_find::UnionFind;
use compiler::bitset::BitSet;
use std::cmp::{max, Ordering};
use std::cmp::{max, min, Ordering};
use std::collections::{HashMap, HashSet};
struct KillSet {
@@ -1437,13 +1437,22 @@ impl Shader<'_> {
let mut gpr_limit = max(max_live[RegFile::GPR], 16);
let mut total_gprs = gpr_limit + u32::from(tmp_gprs);
let max_gprs = if DEBUG.spill() {
let mut max_gprs = if DEBUG.spill() {
// We need at least 16 registers to satisfy RA constraints for
// texture ops and another 2 for parallel copy lowering
18
} else {
self.sm.num_regs(RegFile::GPR)
};
if let ShaderStageInfo::Compute(cs_info) = &self.info.stage {
max_gprs = min(
max_gprs,
gpr_limit_from_local_size(&cs_info.local_size)
- self.sm.hw_reserved_gprs(),
);
}
if total_gprs > max_gprs {
// If we're spilling GPRs, we need to reserve 2 GPRs for OpParCopy
// lowering because it needs to be able lower Mem copies which

View File

@@ -8,6 +8,7 @@ use crate::sm50::ShaderModel50;
use crate::sm70::ShaderModel70;
use acorn::Acorn;
use compiler::bindings::MESA_SHADER_COMPUTE;
use compiler::cfg::CFGBuilder;
use nak_bindings::*;
use std::str::FromStr;
@@ -1188,3 +1189,25 @@ fn test_f2fp_pack_ab() {
// { 1.455fp16, 0.0fp16 }
assert_eq!(data[2][3], 0x3dd24000);
}
/// Hardware test: for every one-dimensional workgroup width from 1 to 1024,
/// patch the compiled test shader so that it claims exactly the number of
/// GPRs returned by `gpr_limit_from_local_size` and check that it still runs.
#[test]
pub fn test_gpr_limit_from_local_size() {
    let runner = RunSingleton::get();
    let builder = TestShaderBuilder::new(runner.sm.as_ref());
    let mut shader = builder.compile();

    for local_size in 1..=1024 {
        let shader_info = &mut shader.bin.info;
        // The compute member of the stage union is only accessed after
        // checking that the shader stage is compute.
        assert_eq!(shader_info.stage, MESA_SHADER_COMPUTE);
        let compute = unsafe { &mut shader_info.__bindgen_anon_1.cs };

        compute.local_size = [local_size, 1, 1];
        let num_gprs = gpr_limit_from_local_size(&compute.local_size);
        shader_info.num_gprs = num_gprs.try_into().unwrap();

        let outcome = runner.run.run::<u8>(&shader, &mut [0; 4096]);
        if outcome.is_err() {
            panic!("Failed with local_size {local_size}, num_gprs {num_gprs}")
        }
    }
}

View File

@@ -7259,6 +7259,25 @@ pub trait ShaderModel {
fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32>;
}
/// For compute shaders, large values of local_size impose an additional limit
/// on the number of GPRs per thread.
///
/// `local_size` is the workgroup size in each of the three dimensions. The
/// thread count is rounded up to the warp allocation granularity, the 65536
/// register budget is divided between those threads, and the result is
/// rounded down to the GPR allocation granularity and capped at 255.
///
/// The thread count is computed in 64 bits, so a product that does not fit in
/// `u16` cannot overflow. A workgroup with zero threads is treated as a single
/// allocation unit, so it yields the 255 cap instead of dividing by zero.
pub fn gpr_limit_from_local_size(local_size: &[u16; 3]) -> u32 {
    // Warps are allocated in multiples of 4
    // Multiply that by 32 threads/warp
    const THREAD_GRANULARITY: u64 = 4 * 32;
    const TOTAL_REGS: u64 = 65536;
    // GPRs are allocated in multiples of 8
    const GPR_GRANULARITY: u64 = 8;
    const MAX_GPRS: u64 = 255;

    // Three u16 factors always fit in a u64, so this product cannot overflow.
    let threads: u64 = local_size.iter().map(|&dim| u64::from(dim)).product();
    // Rounding zero up still gives zero, so clamp to one allocation unit to
    // keep the division below well defined.
    let threads = threads
        .next_multiple_of(THREAD_GRANULARITY)
        .max(THREAD_GRANULARITY);

    let per_thread = TOTAL_REGS / threads;
    let per_thread = (per_thread / GPR_GRANULARITY) * GPR_GRANULARITY;
    u32::try_from(per_thread.min(MAX_GPRS)).expect("limit is capped at 255")
}
pub struct Shader<'a> {
pub sm: &'a dyn ShaderModel,
pub info: ShaderInfo,