// Copyright © 2022 Collabora, Ltd.
// SPDX-License-Identifier: MIT

use crate::api::{GetDebugFlags, DEBUG};
use crate::ir::*;

use std::cmp::max;
use std::collections::{HashMap, HashSet};
use std::ops::{Index, IndexMut, Range};
use std::slice;
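
// One slot of tracking state per architectural register, used to walk
// instruction sources and destinations. The array sizes presumably match
// the number of allocatable registers in each file; the hard-wired
// zero/true registers (RZ, URZ, PT, UPT) never need tracking, which would
// explain why the GPR, UGPR, and predicate files look one register short.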
struct RegTracker<T> {
    reg: [T; 255],
    ureg: [T; 63],
    pred: [T; 7],
    upred: [T; 7],
    carry: [T; 1],
}

fn new_array_with<T, const N: usize>(f: &impl Fn() -> T) -> [T; N] {
    let mut v = Vec::new();
    for _ in 0..N {
        v.push(f());
    }
    v.try_into()
        .unwrap_or_else(|_| panic!("Array size mismatch"))
}
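
// Test-only sketch: new_array_with() exists because the `[f(); N]` repeat
// syntax requires T: Copy, and the tracker payloads (e.g. RegUse, which
// owns a Vec) are not Copy, so the array is built through a Vec instead.
#[cfg(test)]
mod new_array_with_tests {
    use super::*;

    #[test]
    fn builds_non_copy_arrays() {
        // `[Vec::new(); 4]` would not compile since Vec is not Copy.
        let a: [Vec<u32>; 4] = new_array_with(&Vec::new);
        assert!(a.iter().all(|v| v.is_empty()));
    }
}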

impl<T> RegTracker<T> {
    pub fn new_with(f: &impl Fn() -> T) -> Self {
        Self {
            reg: new_array_with(f),
            ureg: new_array_with(f),
            pred: new_array_with(f),
            upred: new_array_with(f),
            carry: new_array_with(f),
        }
    }

    pub fn for_each_instr_pred_mut(
        &mut self,
        instr: &Instr,
        mut f: impl FnMut(&mut T),
    ) {
        if let PredRef::Reg(reg) = &instr.pred.pred_ref {
            for i in &mut self[*reg] {
                f(i);
            }
        }
    }

    pub fn for_each_instr_src_mut(
        &mut self,
        instr: &Instr,
        mut f: impl FnMut(usize, &mut T),
    ) {
        for (i, src) in instr.srcs().iter().enumerate() {
            match &src.src_ref {
                SrcRef::Reg(reg) => {
                    for t in &mut self[*reg] {
                        f(i, t);
                    }
                }
                SrcRef::CBuf(CBufRef {
                    buf: CBuf::BindlessUGPR(reg),
                    ..
                }) => {
                    for t in &mut self[*reg] {
                        f(i, t);
                    }
                }
                _ => (),
            }
        }
    }

    pub fn for_each_instr_dst_mut(
        &mut self,
        instr: &Instr,
        mut f: impl FnMut(usize, &mut T),
    ) {
        for (i, dst) in instr.dsts().iter().enumerate() {
            if let Dst::Reg(reg) = dst {
                for t in &mut self[*reg] {
                    f(i, t);
                }
            }
        }
    }
}

impl<T> Index<RegRef> for RegTracker<T> {
    type Output = [T];

    fn index(&self, reg: RegRef) -> &[T] {
        let range = reg.idx_range();
        let range = Range {
            start: usize::try_from(range.start).unwrap(),
            end: usize::try_from(range.end).unwrap(),
        };

        match reg.file() {
            RegFile::GPR => &self.reg[range],
            RegFile::UGPR => &self.ureg[range],
            RegFile::Pred => &self.pred[range],
            RegFile::UPred => &self.upred[range],
            RegFile::Carry => &self.carry[range],
            RegFile::Bar => &[], // Barriers have a HW scoreboard
            RegFile::Mem => panic!("Not a register"),
        }
    }
}

impl<T> IndexMut<RegRef> for RegTracker<T> {
    fn index_mut(&mut self, reg: RegRef) -> &mut [T] {
        let range = reg.idx_range();
        let range = Range {
            start: usize::try_from(range.start).unwrap(),
            end: usize::try_from(range.end).unwrap(),
        };

        match reg.file() {
            RegFile::GPR => &mut self.reg[range],
            RegFile::UGPR => &mut self.ureg[range],
            RegFile::Pred => &mut self.pred[range],
            RegFile::UPred => &mut self.upred[range],
            RegFile::Carry => &mut self.carry[range],
            RegFile::Bar => &mut [], // Barriers have a HW scoreboard
            RegFile::Mem => panic!("Not a register"),
        }
    }
}

#[derive(Clone)]
enum RegUse<T: Clone> {
    None,
    Write(T),
    Reads(Vec<T>),
}

impl<T: Clone> RegUse<T> {
    pub fn deps(&self) -> &[T] {
        match self {
            RegUse::None => &[],
            RegUse::Write(dep) => slice::from_ref(dep),
            RegUse::Reads(deps) => &deps[..],
        }
    }

    pub fn clear(&mut self) -> Self {
        std::mem::replace(self, RegUse::None)
    }

    pub fn clear_write(&mut self) -> Self {
        if matches!(self, RegUse::Write(_)) {
            std::mem::replace(self, RegUse::None)
        } else {
            RegUse::None
        }
    }

    pub fn add_read(&mut self, dep: T) -> Self {
        match self {
            RegUse::None => {
                *self = RegUse::Reads(vec![dep]);
                RegUse::None
            }
            RegUse::Write(_) => {
                std::mem::replace(self, RegUse::Reads(vec![dep]))
            }
            RegUse::Reads(reads) => {
                reads.push(dep);
                RegUse::None
            }
        }
    }

    pub fn set_write(&mut self, dep: T) -> Self {
        std::mem::replace(self, RegUse::Write(dep))
    }
}
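
// Test-only sketch of the RegUse state machine: reads accumulate, a write
// displaces whatever was outstanding, and each mutator returns the
// displaced uses so the caller can record waits on them.
#[cfg(test)]
mod reg_use_tests {
    use super::*;

    #[test]
    fn write_displaces_reads() {
        let mut u: RegUse<usize> = RegUse::None;
        assert!(u.add_read(1).deps().is_empty());
        assert!(u.add_read(2).deps().is_empty());
        // The new write must wait on both outstanding reads.
        assert_eq!(u.set_write(3).deps(), &[1, 2]);
        // Now the write is the only outstanding use.
        assert_eq!(u.deps(), &[3]);
    }
}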

struct DepNode {
    read_dep: Option<usize>,
    first_wait: Option<(usize, usize)>,
}

struct DepGraph {
    deps: Vec<DepNode>,
    instr_deps: HashMap<(usize, usize), (usize, usize)>,
    instr_waits: HashMap<(usize, usize), Vec<usize>>,
    active: HashSet<usize>,
}
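
// Commentary: every non-fixed-latency instruction gets two dep nodes, one
// that signals when its sources have been read and one that signals when
// its destinations have been written (see add_instr() below). Instructions
// are keyed by (block index, instruction index), and those tuples compare
// lexicographically, which is what orders program points in first_wait.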
impl DepGraph {
    pub fn new() -> Self {
        Self {
            deps: Vec::new(),
            instr_deps: HashMap::new(),
            instr_waits: HashMap::new(),
            active: HashSet::new(),
        }
    }

    fn add_new_dep(&mut self, read_dep: Option<usize>) -> usize {
        let dep = self.deps.len();
        self.deps.push(DepNode {
            read_dep,
            first_wait: None,
        });
        dep
    }

    pub fn add_instr(&mut self, block_idx: usize, ip: usize) -> (usize, usize) {
        let rd = self.add_new_dep(None);
        let wr = self.add_new_dep(Some(rd));
        self.instr_deps.insert((block_idx, ip), (rd, wr));
        (rd, wr)
    }

    pub fn add_signal(&mut self, dep: usize) {
        self.active.insert(dep);
    }

    pub fn add_waits(
        &mut self,
        block_idx: usize,
        ip: usize,
        mut waits: Vec<usize>,
    ) {
        for dep in &waits {
            // A wait on a write automatically waits on the read. By removing
            // it from the active set here we ensure that we don't record any
            // duplicate write/read waits in the retain below.
            if let Some(rd) = &self.deps[*dep].read_dep {
                self.active.remove(rd);
            }
        }

        waits.retain(|dep| {
            let node = &mut self.deps[*dep];
            if let Some(wait) = node.first_wait {
                // Someone has already waited on this dep
                debug_assert!(!self.active.contains(dep));
                debug_assert!((block_idx, ip) >= wait);
                false
            } else if !self.active.contains(dep) {
                // Even if it doesn't have a use, it may still be deactivated.
                // This can happen if we depend on the destination before any
                // of its sources.
                false
            } else {
                self.deps[*dep].first_wait = Some((block_idx, ip));
                self.active.remove(dep);
                true
            }
        });

        // Sort for stability. The list of waits may come from a HashSet (see
        // add_barrier()) and so it's not guaranteed stable across Rust
        // versions. This also ensures that everything always waits on the
        // oldest dependencies first.
        waits.sort();

        let _old = self.instr_waits.insert((block_idx, ip), waits);
        debug_assert!(_old.is_none());
    }

    pub fn add_barrier(&mut self, block_idx: usize, ip: usize) {
        let waits = self.active.iter().cloned().collect();
        self.add_waits(block_idx, ip, waits);
        debug_assert!(self.active.is_empty());
    }

    pub fn dep_is_waited_after(
        &self,
        dep: usize,
        block_idx: usize,
        ip: usize,
    ) -> bool {
        if let Some(wait) = self.deps[dep].first_wait {
            wait > (block_idx, ip)
        } else {
            false
        }
    }

    pub fn get_instr_deps(
        &self,
        block_idx: usize,
        ip: usize,
    ) -> (usize, usize) {
        *self.instr_deps.get(&(block_idx, ip)).unwrap()
    }

    pub fn get_instr_waits(&self, block_idx: usize, ip: usize) -> &[usize] {
        if let Some(waits) = self.instr_waits.get(&(block_idx, ip)) {
            &waits[..]
        } else {
            &[]
        }
    }
}
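
// Test-only sketch of the dep-graph contract: a dep records only its first
// wait, a wait on a write dep implicitly retires the paired read dep, and
// dep_is_waited_after() compares program points lexicographically. The
// (block, ip) values here are invented for the test.
#[cfg(test)]
mod dep_graph_tests {
    use super::*;

    #[test]
    fn write_wait_retires_read() {
        let mut g = DepGraph::new();
        let (rd, wr) = g.add_instr(0, 0);
        g.add_signal(rd);
        g.add_signal(wr);

        // Waiting on the write also covers the read, so only the write
        // survives the retain inside add_waits().
        g.add_waits(0, 1, vec![rd, wr]);
        assert_eq!(g.get_instr_waits(0, 1), &[wr]);

        // The first wait at (0, 1) is after (0, 0) but not after itself.
        assert!(g.dep_is_waited_after(wr, 0, 0));
        assert!(!g.dep_is_waited_after(wr, 0, 1));
    }
}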

struct BarAlloc {
    num_bars: u8,
    bar_dep: [usize; 6],
}

impl BarAlloc {
    pub fn new() -> BarAlloc {
        BarAlloc {
            num_bars: 6,
            bar_dep: [usize::MAX; 6],
        }
    }

    pub fn bar_is_free(&self, bar: u8) -> bool {
        debug_assert!(bar < self.num_bars);
        self.bar_dep[usize::from(bar)] == usize::MAX
    }

    pub fn set_bar_dep(&mut self, bar: u8, dep: usize) {
        debug_assert!(self.bar_is_free(bar));
        self.bar_dep[usize::from(bar)] = dep;
    }

    pub fn free_bar(&mut self, bar: u8) {
        debug_assert!(!self.bar_is_free(bar));
        self.bar_dep[usize::from(bar)] = usize::MAX;
    }

    pub fn try_find_free_bar(&self) -> Option<u8> {
        for bar in 0..self.num_bars {
            if self.bar_is_free(bar) {
                return Some(bar);
            }
        }
        None
    }

    pub fn free_some_bar(&mut self) -> u8 {
        // Get the oldest by looking for the one with the smallest dep
        let mut bar = 0;
        for b in 1..self.num_bars {
            if self.bar_dep[usize::from(b)] < self.bar_dep[usize::from(bar)] {
                bar = b;
            }
        }
        self.free_bar(bar);
        bar
    }

    pub fn get_bar_for_dep(&self, dep: usize) -> Option<u8> {
        for bar in 0..self.num_bars {
            if self.bar_dep[usize::from(bar)] == dep {
                return Some(bar);
            }
        }
        None
    }
}
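
// Test-only sketch of the scoreboard allocator: six barriers, and when all
// are busy, free_some_bar() evicts the one tracking the oldest (smallest)
// dep index, since dep indices increase in program order.
#[cfg(test)]
mod bar_alloc_tests {
    use super::*;

    #[test]
    fn evicts_oldest_dep() {
        let mut bars = BarAlloc::new();
        assert_eq!(bars.try_find_free_bar(), Some(0));
        for bar in 0..6u8 {
            bars.set_bar_dep(bar, 100 + usize::from(bar));
        }
        assert!(bars.try_find_free_bar().is_none());
        // Bar 0 holds dep 100, the smallest, so it is the one evicted.
        assert_eq!(bars.free_some_bar(), 0);
        assert_eq!(bars.get_bar_for_dep(101), Some(1));
    }
}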

fn assign_barriers(f: &mut Function, sm: &dyn ShaderModel) {
    let mut uses = RegTracker::new_with(&|| RegUse::None);
    let mut deps = DepGraph::new();

    for (bi, b) in f.blocks.iter().enumerate() {
        for (ip, instr) in b.instrs.iter().enumerate() {
            if instr.is_branch() {
                deps.add_barrier(bi, ip);
            } else {
                // Execution predicates are handled immediately and we don't
                // need barriers for them, regardless of whether or not it's a
                // fixed-latency instruction.
                let mut waits = Vec::new();
                uses.for_each_instr_pred_mut(instr, |u| {
                    let u = u.clear_write();
                    waits.extend_from_slice(u.deps());
                });

                if instr.has_fixed_latency(sm.sm()) {
                    // Delays will cover us here. We just need to make sure
                    // that we wait on any uses that we consume.
                    uses.for_each_instr_src_mut(instr, |_, u| {
                        let u = u.clear_write();
                        waits.extend_from_slice(u.deps());
                    });
                    uses.for_each_instr_dst_mut(instr, |_, u| {
                        let u = u.clear();
                        waits.extend_from_slice(u.deps());
                    });
                } else {
                    let (rd, wr) = deps.add_instr(bi, ip);
                    uses.for_each_instr_src_mut(instr, |_, u| {
                        // Only mark a dep as signaled if we actually have
                        // something that shows up in the register file as
                        // needing scoreboarding
                        deps.add_signal(rd);
                        let u = u.add_read(rd);
                        waits.extend_from_slice(u.deps());
                    });
                    uses.for_each_instr_dst_mut(instr, |_, u| {
                        // Only mark a dep as signaled if we actually have
                        // something that shows up in the register file as
                        // needing scoreboarding
                        deps.add_signal(wr);
                        let u = u.set_write(wr);
                        for dep in u.deps() {
                            // Don't wait on ourselves
                            if *dep != rd {
                                waits.push(*dep);
                            }
                        }
                    });
                }
                deps.add_waits(bi, ip, waits);
            }
        }
    }
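
    // Second pass: map the abstract deps chosen above onto the six HW
    // scoreboards. An instruction's waits are processed before its own
    // deps are allocated, so a scoreboard freed by a wait can be reused
    // at the same instruction.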
    let mut bars = BarAlloc::new();

    for (bi, b) in f.blocks.iter_mut().enumerate() {
        for (ip, instr) in b.instrs.iter_mut().enumerate() {
            let mut wait_mask = 0_u8;
            for dep in deps.get_instr_waits(bi, ip) {
                if let Some(bar) = bars.get_bar_for_dep(*dep) {
                    wait_mask |= 1 << bar;
                    bars.free_bar(bar);
                }
            }
            instr.deps.add_wt_bar_mask(wait_mask);

            if instr.needs_yield() {
                instr.deps.set_yield(true);
            }

            if instr.has_fixed_latency(sm.sm()) {
                continue;
            }

            let (rd_dep, wr_dep) = deps.get_instr_deps(bi, ip);
            if deps.dep_is_waited_after(rd_dep, bi, ip) {
                let rd_bar = bars.try_find_free_bar().unwrap_or_else(|| {
                    let bar = bars.free_some_bar();
                    instr.deps.add_wt_bar(bar);
                    bar
                });
                bars.set_bar_dep(rd_bar, rd_dep);
                instr.deps.set_rd_bar(rd_bar);
            }
            if deps.dep_is_waited_after(wr_dep, bi, ip) {
                let wr_bar = bars.try_find_free_bar().unwrap_or_else(|| {
                    let bar = bars.free_some_bar();
                    instr.deps.add_wt_bar(bar);
                    bar
                });
                bars.set_bar_dep(wr_bar, wr_dep);
                instr.deps.set_wr_bar(wr_bar);
            }
        }
    }
}

fn exec_latency(sm: u8, op: &Op) -> u32 {
    match op {
        Op::Bar(_) | Op::MemBar(_) => {
            if sm >= 80 {
                6
            } else {
                5
            }
        }
        Op::CCtl(_op) => {
            // CCTL.C needs 8, CCTL.I needs 11
            11
        }
        _ if sm < 70 && (op.is_crs_push() || op.is_branch()) => {
            // pre-Volta needs a delay for control-flow ops
            13
        }
        // Op::DepBar(_) => 4,
        _ => 1, // TODO: co-issue
    }
}

fn instr_latency(op: &Op, dst_idx: usize) -> u32 {
    let file = match op.dsts_as_slice()[dst_idx] {
        Dst::None => return 0,
        Dst::SSA(vec) => vec.file().unwrap(),
        Dst::Reg(reg) => reg.file(),
    };

    // This is BS and we know it
    match file {
        RegFile::GPR => 6,
        RegFile::UGPR => 12,
        RegFile::Pred => 13,
        RegFile::UPred => 11,
        RegFile::Bar => 0, // Barriers have a HW scoreboard
        RegFile::Carry => 6,
        RegFile::Mem => panic!("Not a register"),
    }
}

/// Read-after-write latency
fn raw_latency(
    _sm: u8,
    write: &Op,
    dst_idx: usize,
    _read: &Op,
    _src_idx: usize,
) -> u32 {
    instr_latency(write, dst_idx)
}

/// Write-after-read latency
fn war_latency(
    _sm: u8,
    _read: &Op,
    _src_idx: usize,
    _write: &Op,
    _dst_idx: usize,
) -> u32 {
    // We assume the source gets read in the first 4 cycles. We don't know
    // how quickly the write will happen. This is all a guess.
    4
}

/// Write-after-write latency
fn waw_latency(
    _sm: u8,
    a: &Op,
    a_dst_idx: usize,
    _b: &Op,
    _b_dst_idx: usize,
) -> u32 {
    // We know our latencies are wrong so assume the write could happen
    // anywhere between 0 and instr_latency(a) cycles
    instr_latency(a, a_dst_idx)
}

/// Predicate read-after-write latency
fn paw_latency(_sm: u8, _write: &Op, _dst_idx: usize) -> u32 {
    13
}
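
// Commentary on the pass below: delays are assigned by walking each block
// backwards. `cycle` counts distance from the end of the block, each
// register tracks its next use (read or write) and the cycle at which it
// happens, and every instruction's issue point is pushed early enough to
// cover the worst required latency before its delay field is clamped into
// the [MIN_INSTR_DELAY, MAX_INSTR_DELAY] range the hardware supports.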
fn calc_delays(f: &mut Function, sm: &dyn ShaderModel) {
    for b in f.blocks.iter_mut().rev() {
        let mut cycle = 0_u32;

        // Vector mapping IP to start cycle
        let mut instr_cycle = Vec::new();
        instr_cycle.resize(b.instrs.len(), 0_u32);

        // Maps registers to a RegUse of (ip, src/dst index). Predicates are
        // represented by src_idx = usize::MAX.
        let mut uses: RegTracker<RegUse<(usize, usize)>> =
            RegTracker::new_with(&|| RegUse::None);

        // Map from barrier to last waited cycle
        let mut bars = [0_u32; 6];

        for ip in (0..b.instrs.len()).rev() {
            let instr = &b.instrs[ip];
            let mut min_start = cycle + exec_latency(sm.sm(), &instr.op);
            if let Some(bar) = instr.deps.rd_bar() {
                min_start = max(min_start, bars[usize::from(bar)] + 2);
            }
            if let Some(bar) = instr.deps.wr_bar() {
                min_start = max(min_start, bars[usize::from(bar)] + 2);
            }
            uses.for_each_instr_dst_mut(instr, |i, u| match u {
                RegUse::None => {
                    // We don't know how it will be used but it may be used
                    // in the next block so we need to at least assume the
                    // maximum destination latency from the end of the block.
                    let s = instr_latency(&instr.op, i);
                    min_start = max(min_start, s);
                }
                RegUse::Write((w_ip, w_dst_idx)) => {
                    let s = instr_cycle[*w_ip]
                        + waw_latency(
                            sm.sm(),
                            &instr.op,
                            i,
                            &b.instrs[*w_ip].op,
                            *w_dst_idx,
                        );
                    min_start = max(min_start, s);
                }
                RegUse::Reads(reads) => {
                    for (r_ip, r_src_idx) in reads {
                        let c = instr_cycle[*r_ip];
                        let s = if *r_src_idx == usize::MAX {
                            c + paw_latency(sm.sm(), &instr.op, i)
                        } else {
                            c + raw_latency(
                                sm.sm(),
                                &instr.op,
                                i,
                                &b.instrs[*r_ip].op,
                                *r_src_idx,
                            )
                        };
                        min_start = max(min_start, s);
                    }
                }
            });
            uses.for_each_instr_src_mut(instr, |i, u| match u {
                RegUse::None => (),
                RegUse::Write((w_ip, w_dst_idx)) => {
                    let s = instr_cycle[*w_ip]
                        + war_latency(
                            sm.sm(),
                            &instr.op,
                            i,
                            &b.instrs[*w_ip].op,
                            *w_dst_idx,
                        );
                    min_start = max(min_start, s);
                }
                RegUse::Reads(_) => (),
            });

            let instr = &mut b.instrs[ip];

            let delay = min_start - cycle;
            let delay = delay
                .clamp(MIN_INSTR_DELAY.into(), MAX_INSTR_DELAY.into())
                .try_into()
                .unwrap();
            instr.deps.set_delay(delay);

            instr_cycle[ip] = min_start;
            uses.for_each_instr_pred_mut(instr, |c| {
                c.add_read((ip, usize::MAX));
            });
            uses.for_each_instr_src_mut(instr, |i, c| {
                c.add_read((ip, i));
            });
            uses.for_each_instr_dst_mut(instr, |i, c| {
                c.set_write((ip, i));
            });
            for (bar, c) in bars.iter_mut().enumerate() {
                if instr.deps.wt_bar_mask & (1 << bar) != 0 {
                    *c = min_start;
                }
            }

            cycle = min_start;
        }
    }

    // It's unclear exactly why, but the blob inserts a Nop with a delay of
    // 2 after every instruction which has an exec latency. Perhaps it has
    // something to do with .yld? In any case, the extra 2 cycles are a
    // small price to pay to avoid the chance of weird bugs.
    f.map_instrs(|mut instr, _| {
        if matches!(instr.op, Op::SrcBar(_)) {
            instr.op = Op::Nop(OpNop { label: None });
            MappedInstrs::One(instr)
        } else if exec_latency(sm.sm(), &instr.op) > 1 {
            let mut nop = Instr::new_boxed(OpNop { label: None });
            nop.deps.set_delay(2);
            MappedInstrs::Many(vec![instr, nop])
        } else {
            MappedInstrs::One(instr)
        }
    });
}

impl Shader<'_> {
    pub fn assign_deps_serial(&mut self) {
        for f in &mut self.functions {
            for b in f.blocks.iter_mut().rev() {
                let mut wt = 0_u8;
                for instr in &mut b.instrs {
                    if matches!(&instr.op, Op::Bar(_))
                        || matches!(&instr.op, Op::BClear(_))
                        || matches!(&instr.op, Op::BSSy(_))
                        || matches!(&instr.op, Op::BSync(_))
                    {
                        instr.deps.set_yield(true);
                    } else if instr.is_branch() {
                        instr.deps.add_wt_bar_mask(0x3f);
                    } else {
                        instr.deps.add_wt_bar_mask(wt);
                        if instr.dsts().len() > 0 {
                            instr.deps.set_wr_bar(0);
                            wt |= 1 << 0;
                        }
                        if !instr.pred.pred_ref.is_none()
                            || instr.srcs().len() > 0
                        {
                            instr.deps.set_rd_bar(1);
                            wt |= 1 << 1;
                        }
                    }
                }
            }
        }
    }
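
    // Commentary: the serial path above is a maximally conservative debug
    // fallback. Every scoreboarded instruction takes write barrier 0 and
    // read barrier 1, and each following instruction waits on whichever of
    // those have been taken so far, so dependent work never overlaps.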

    pub fn calc_instr_deps(&mut self) {
        if DEBUG.serial() {
            self.assign_deps_serial();
        } else {
            for f in &mut self.functions {
                assign_barriers(f, self.sm);
                calc_delays(f, self.sm);
            }
        }
    }
}