Files
third_party_mesa3d/src/asahi/compiler/agx_opcodes.py
Alyssa Rosenzweig 7ed2596fe7 agx: Implement fence_*_to_tex_agx intrinsics
We need these fencing intrinsics because our image caches aren't coherent with
memory. Furthermore, we need some sync intrinsics for imageblocks (which are
spicy images). These are a stub of what the final fragment shader interlock
implementation will look like, or what a real Metal-grade imageblock
implementation needs, but this is good enough for handling the sync requirements
with spilled render targets.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24258>
2023-07-20 15:33:28 +00:00

407 lines
13 KiB
Python

"""
Copyright 2021 Alyssa Rosenzweig
# SPDX-License-Identifier: MIT
"""
opcodes = {}
immediates = {}
enums = {}
VARIABLE = ~0
class Opcode(object):
def __init__(self, name, dests, srcs, imms, is_float, can_eliminate,
can_reorder, encoding_16, encoding_32):
self.name = name
self.dests = dests if dests != VARIABLE else 0
self.srcs = srcs if srcs != VARIABLE else 0
self.variable_srcs = (srcs == VARIABLE)
self.variable_dests = (dests == VARIABLE)
self.imms = imms
self.is_float = is_float
self.can_eliminate = can_eliminate
self.can_reorder = can_reorder
self.encoding_16 = encoding_16
self.encoding_32 = encoding_32
class Immediate(object):
def __init__(self, name, ctype):
self.name = name
self.ctype = ctype
class Encoding(object):
def __init__(self, description):
(exact, mask, length_short, length_long) = description
# Convenience
if length_long is None:
length_long = length_short
self.exact = exact
self.mask = mask
self.length_short = length_short
self.extensible = length_short != length_long
if self.extensible:
assert(length_long == length_short + (4 if length_short > 8 else 2))
def op(name, encoding_32, dests = 1, srcs = 0, imms = [], is_float = False,
can_eliminate = True, can_reorder = True, encoding_16 = None):
encoding_16 = Encoding(encoding_16) if encoding_16 is not None else None
encoding_32 = Encoding(encoding_32) if encoding_32 is not None else None
opcodes[name] = Opcode(name, dests, srcs, imms, is_float, can_eliminate, can_reorder, encoding_16, encoding_32)
def immediate(name, ctype = "uint32_t"):
imm = Immediate(name, ctype)
immediates[name] = imm
return imm
def enum(name, value_dict):
enums[name] = value_dict
return immediate(name, "enum agx_" + name)
L = (1 << 15)
_ = None
FORMAT = immediate("format", "enum agx_format")
IMM = immediate("imm")
WRITEOUT = immediate("writeout")
INDEX = immediate("index")
COMPONENT = immediate("component")
CHANNELS = immediate("channels")
TRUTH_TABLE = immediate("truth_table")
ROUND = immediate("round", "enum agx_round")
SHIFT = immediate("shift")
MASK = immediate("mask")
BFI_MASK = immediate("bfi_mask")
LOD_MODE = immediate("lod_mode", "enum agx_lod_mode")
PIXEL_OFFSET = immediate("pixel_offset")
DIM = enum("dim", {
0: '1d',
1: '1d_array',
2: '2d',
3: '2d_array',
4: '2d_ms',
5: '3d',
6: 'cube',
7: 'cube_array',
8: '2d_ms_array',
})
GATHER = enum("gather", {
0b000: "none",
0b001: "r",
0b011: "g",
0b101: "b",
0b111: "a",
})
OFFSET = immediate("offset", "bool")
SHADOW = immediate("shadow", "bool")
SCOREBOARD = immediate("scoreboard")
ICOND = immediate("icond", "enum agx_icond")
FCOND = immediate("fcond", "enum agx_fcond")
NEST = immediate("nest")
INVERT_COND = immediate("invert_cond")
NEST = immediate("nest")
TARGET = immediate("target", "agx_block *")
ZS = immediate("zs")
SR = enum("sr", {
0: 'threadgroup_position_in_grid.x',
1: 'threadgroup_position_in_grid.y',
2: 'threadgroup_position_in_grid.z',
4: 'threads_per_threadgroup.x',
5: 'threads_per_threadgroup.y',
6: 'threads_per_threadgroup.z',
8: 'dispatch_threads_per_threadgroup.x',
9: 'dispatch_threads_per_threadgroup.y',
10: 'dispatch_threads_per_threadgroup.z',
20: 'core_index',
21: 'vm_slot',
48: 'thread_position_in_threadgroup.x',
49: 'thread_position_in_threadgroup.y',
50: 'thread_position_in_threadgroup.z',
51: 'thread_index_in_threadgroup',
52: 'thread_index_in_subgroup',
53: 'subgroup_index_in_threadgroup',
56: 'active_thread_index_in_quad',
58: 'active_thread_index_in_subgroup',
62: 'backfacing',
63: 'is_active_thread',
80: 'thread_position_in_grid.x',
81: 'thread_position_in_grid.y',
82: 'thread_position_in_grid.z',
124: 'input_sample_mask',
144: 'opfifo_cmd',
146: 'opfifo_data_l',
147: 'opfifo_data_h',
})
ATOMIC_OPC = enum("atomic_opc", {
0: 'add',
1: 'sub',
2: 'xchg',
3: 'cmpxchg',
4: 'umin',
5: 'imin',
6: 'umax',
7: 'imax',
8: 'and',
9: 'or',
10: 'xor',
})
INTERPOLATION = enum("interpolation", {
0: 'center',
1: 'sample',
2: 'centroid',
# We translate sample -> sample_register at pack time for simplicity
3: 'sample_register',
})
FUNOP = lambda x: (x << 28)
FUNOP_MASK = FUNOP((1 << 14) - 1)
def funop(name, opcode):
op(name, (0x0A | L | (opcode << 28),
0x3F | L | (((1 << 14) - 1) << 28), 6, _),
srcs = 1, is_float = True)
def iunop(name, opcode):
assert(opcode < 4)
op(name, (0x3E | (opcode << 26),
0x7F | L | (((1 << 14) - 1) << 26),
6, _),
srcs = 1)
# Listing of opcodes
funop("floor", 0b000000)
funop("srsqrt", 0b000001)
funop("dfdx", 0b000100)
funop("dfdy", 0b000110)
funop("rcp", 0b001000)
funop("rsqrt", 0b001001)
funop("sin_pt_1", 0b001010)
funop("log2", 0b001100)
funop("exp2", 0b001101)
funop("sin_pt_2", 0b001110)
funop("ceil", 0b010000)
funop("trunc", 0b100000)
funop("roundeven", 0b110000)
iunop("bitrev", 0b01)
iunop("popcount", 0b10)
iunop("ffs", 0b11)
op("fadd",
encoding_16 = (0x26 | L, 0x3F | L, 6, _),
encoding_32 = (0x2A | L, 0x3F | L, 6, _),
srcs = 2, is_float = True)
op("fma",
encoding_16 = (0x36, 0x3F, 6, 8),
encoding_32 = (0x3A, 0x3F, 6, 8),
srcs = 3, is_float = True)
op("fmul",
encoding_16 = ((0x16 | L), (0x3F | L), 6, _),
encoding_32 = ((0x1A | L), (0x3F | L), 6, _),
srcs = 2, is_float = True)
op("mov_imm",
encoding_32 = (0x62, 0xFF, 6, 8),
encoding_16 = (0x62, 0xFF, 4, 6),
imms = [IMM])
op("iadd",
encoding_32 = (0x0E, 0x3F | L, 8, _),
srcs = 2, imms = [SHIFT])
op("imad",
encoding_32 = (0x1E, 0x3F | L, 8, _),
srcs = 3, imms = [SHIFT])
op("bfi",
encoding_32 = (0x2E, 0x7F | (0x3 << 26), 8, _),
srcs = 3, imms = [BFI_MASK])
op("bfeil",
encoding_32 = (0x2E | L, 0x7F | L | (0x3 << 26), 8, _),
srcs = 3, imms = [BFI_MASK])
op("extr",
encoding_32 = (0x2E | (0x1 << 26), 0x7F | L | (0x3 << 26), 8, _),
srcs = 3, imms = [BFI_MASK])
op("asr",
encoding_32 = (0x2E | L | (0x1 << 26), 0x7F | L | (0x3 << 26), 8, _),
srcs = 2)
op("icmpsel",
encoding_32 = (0x12, 0x7F, 8, 10),
srcs = 4, imms = [ICOND])
op("fcmpsel",
encoding_32 = (0x02, 0x7F, 8, 10),
srcs = 4, imms = [FCOND])
# sources are coordinates, LOD, texture bindless base (zero for texture state
# registers), texture, sampler, shadow/offset
# TODO: anything else?
op("texture_sample",
encoding_32 = (0x31, 0x7F, 8, 10), # XXX WRONG SIZE
srcs = 6, imms = [DIM, LOD_MODE, MASK, SCOREBOARD, OFFSET, SHADOW,
GATHER])
for memory, can_reorder in [("texture", True), ("image", False)]:
op(f"{memory}_load", encoding_32 = (0x71, 0x7F, 8, 10), # XXX WRONG SIZE
srcs = 6, imms = [DIM, LOD_MODE, MASK, SCOREBOARD, OFFSET],
can_reorder = can_reorder)
# sources are base, index
op("device_load",
encoding_32 = (0x05, 0x7F, 6, 8),
srcs = 2, imms = [FORMAT, MASK, SHIFT, SCOREBOARD], can_reorder = False)
# sources are base (relative to workgroup memory), index
op("local_load",
encoding_32 = (0b1101001, 0, 6, 8),
srcs = 2, imms = [FORMAT, MASK])
# sources are value, base, index
# TODO: Consider permitting the short form
op("device_store",
encoding_32 = (0x45 | (1 << 47), 0, 8, _),
dests = 0, srcs = 3, imms = [FORMAT, MASK, SHIFT, SCOREBOARD], can_eliminate = False)
# sources are value, base, index
op("local_store",
encoding_32 = (0b0101001, 0, 6, 8),
dests = 0, srcs = 3, imms = [FORMAT, MASK],
can_eliminate=False)
# sources are value, index
# TODO: Consider permitting the short form
op("uniform_store",
encoding_32 = ((0b111 << 27) | 0b1000101 | (1 << 47), 0, 8, _),
dests = 0, srcs = 2, can_eliminate = False)
# sources are value, base, index
op("atomic",
encoding_32 = (0x15 | (1 << 26) | (1 << 31) | (5 << 44), 0x3F | (1 << 26) | (1 << 31) | (5 << 44), 8, _),
dests = 1, srcs = 3, imms = [ATOMIC_OPC, SCOREBOARD], can_eliminate = False)
# XXX: stop hardcoding the long form
op("local_atomic",
encoding_32 = (0x19 | (1 << 15) | (1 << 36) | (1 << 47), 0x3F | (1 << 36) | (1 << 47), 10, _),
dests = 1, srcs = 3, imms = [ATOMIC_OPC], can_eliminate = False)
op("wait", (0x38, 0xFF, 2, _), dests = 0,
can_eliminate = False, imms = [SCOREBOARD])
op("get_sr", (0x72, 0x7F | L, 4, _), dests = 1, imms = [SR])
op("sample_mask", (0x7fc1, 0xffff, 6, _), dests = 0, srcs = 2, can_eliminate = False)
# Sources: sample mask, combined depth/stencil
op("zs_emit", (0x41, 0xFF | L, 4, _), dests = 0, srcs = 2,
can_eliminate = False, imms = [ZS])
# Essentially same encoding. Last source is the sample mask
op("ld_tile", (0x49, 0x7F, 8, _), dests = 1, srcs = 1,
imms = [FORMAT, MASK, PIXEL_OFFSET], can_reorder = False)
op("st_tile", (0x09, 0x7F, 8, _), dests = 0, srcs = 2,
can_eliminate = False, imms = [FORMAT, MASK, PIXEL_OFFSET])
for (name, exact) in [("any", 0xC000), ("none", 0xC200)]:
op("jmp_exec_" + name, (exact, (1 << 16) - 1, 6, _), dests = 0, srcs = 0,
can_eliminate = False, imms = [TARGET])
# TODO: model implicit r0l destinations
op("pop_exec", (0x52 | (0x3 << 9), ((1 << 48) - 1) ^ (0x3 << 7) ^ (0x3 << 11), 6, _),
dests = 0, srcs = 0, can_eliminate = False, imms = [NEST])
for is_float in [False, True]:
mod_mask = 0 if is_float else (0x3 << 26) | (0x3 << 38)
for (cf, cf_op) in [("if", 0), ("else", 1), ("while", 2)]:
name = "{}_{}cmp".format(cf, "f" if is_float else "i")
exact = 0x42 | (0x0 if is_float else 0x10) | (cf_op << 9)
mask = 0x7F | (0x3 << 9) | mod_mask | (0x3 << 44)
imms = [NEST, FCOND if is_float else ICOND, INVERT_COND]
op(name, (exact, mask, 6, _), dests = 0, srcs = 2, can_eliminate = False,
imms = imms, is_float = is_float)
op("bitop", (0x7E, 0x7F, 6, _), srcs = 2, imms = [TRUTH_TABLE])
op("intl", (0x3E, 0x7F, 6, _), srcs = 2, imms = [])
op("convert", (0x3E | L, 0x7F | L | (0x3 << 38), 6, _), srcs = 2, imms = [ROUND])
# Sources are the coeffient register and the sample index (if applicable)
op("iter", (0x21, 0xBF, 8, _), srcs = 2, imms = [CHANNELS, INTERPOLATION])
# Sources are the coeffient register for the varying, the coefficient register
# for W, and the sample index (if applicable)
op("iterproj", (0x21, 0xBF, 8, _), srcs = 3, imms = [CHANNELS, INTERPOLATION])
op("ldcf", (0xA1, 0xBF, 8, _), srcs = 1, imms = [CHANNELS])
op("st_vary", None, dests = 0, srcs = 2, can_eliminate = False)
op("no_varyings", (0x80000051, 0xFFFFFFFF, 4, _), dests = 0, can_eliminate = False)
op("stop", (0x88, 0xFFFF, 2, _), dests = 0, can_eliminate = False)
op("trap", (0x08, 0xFFFF, 2, _), dests = 0, can_eliminate = False)
op("wait_pix", (0x48, 0xFF, 4, _), dests = 0, imms = [WRITEOUT], can_eliminate = False)
op("signal_pix", (0x58, 0xFF, 4, _), dests = 0, imms = [WRITEOUT], can_eliminate = False)
# Sources are the data vector, the coordinate vector, the LOD, the bindless
# table if present (zero for texture state registers), and texture index.
op("image_write", (0xF1 | (1 << 23) | (9 << 43), 0xFF, 6, 8), dests = 0, srcs = 5, imms
= [DIM], can_eliminate = False)
# Sources are the image and the offset within shared memory
# TODO: Do we need the short encoding?
op("block_image_store", (0xB1, 0xFF, 10, _), dests = 0, srcs = 2,
imms = [FORMAT, DIM], can_eliminate = False)
# Barriers
op("threadgroup_barrier", (0x0068, 0xFFFF, 2, _), dests = 0, srcs = 0,
can_eliminate = False)
def memory_barrier(name, a, b, c):
op(name, (0xF5 | (a << 10) | (b << 8) | (c << 12), 0xFFFF, 2, _), dests = 0, srcs = 0,
can_eliminate = False)
memory_barrier("memory_barrier", 1, 2, 9)
# TODO: Not clear what these individually are. Some might be cache flushes?
memory_barrier("image_barrier_1", 2, 2, 10)
memory_barrier("image_barrier_2", 3, 2, 10)
memory_barrier("image_barrier_3", 2, 1, 10)
memory_barrier("image_barrier_4", 3, 1, 10)
memory_barrier("flush_memory_to_texture", 0, 0, 4)
# Convenient aliases.
op("mov", _, srcs = 1)
op("not", _, srcs = 1)
op("xor", _, srcs = 2)
op("and", _, srcs = 2)
op("or", _, srcs = 2)
# Indicates the logical end of the block, before final branches/control flow
op("logical_end", _, dests = 0, srcs = 0, can_eliminate = False)
op("collect", _, srcs = VARIABLE)
op("split", _, srcs = 1, dests = VARIABLE)
op("phi", _, srcs = VARIABLE)
op("unit_test", _, dests = 0, srcs = 1, can_eliminate = False)
# Like mov, but takes a register and can only appear at the start. Guaranteed
# to be coalesced during RA, rather than lowered to a real move.
op("preload", _, srcs = 1)
# Set the nesting counter. Lowers to mov r0l, x after RA.
op("nest", _, dests = 0, srcs = 1, can_eliminate = False)