third_party_mesa3d/src/panfrost/bifrost/bi_schedule.c
Alyssa Rosenzweig 90ab3994de pan/bi: Don't allow ATEST to take a temporary
Clause scheduler edition of db2bdc1dc3 ("pan/bi: Require ATEST coverage mask
input in R60"). ATEST wants to read r60, which can't work if its input isn't
even in a register.

When per-sample shading isn't in use, prevents regressions in:

   KHR-GLES31.core.sample_variables.mask.*

These tests previously passed because per-sample shading was forced. It's
not clear whether the bug addressed in this patch is possible to hit "in the
wild", i.e. without the optimizations in this series that allow us to use
per-pixel shading in more cases.

No shader-db changes.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Cc: mesa-stable
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17841>
(cherry picked from commit 394e1f5862)
2022-09-14 10:25:19 -07:00


/*
* Copyright (C) 2020 Collabora Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors (Collabora):
* Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
*/
#include "compiler.h"
#include "bi_builder.h"
/* Arguments common to worklist, passed by value for convenience */
struct bi_worklist {
/* # of instructions in the block */
unsigned count;
/* Instructions in the block */
bi_instr **instructions;
/* Bitset of instructions in the block ready for scheduling */
BITSET_WORD *worklist;
/* The backwards dependency graph. dep_counts[i] is the number of
* unscheduled instructions that must still be scheduled after (before)
* this instruction. dependents are which instructions need to be
* scheduled before (after) this instruction. */
unsigned *dep_counts;
BITSET_WORD **dependents;
};
/* State of a single tuple and clause under construction */
struct bi_reg_state {
/* Number of register writes */
unsigned nr_writes;
/* Register reads, expressed as (equivalence classes of)
* sources. Only 3 reads are allowed, but up to 2 may spill as
* "forced" for the next scheduled tuple, provided such a tuple
* can be constructed */
bi_index reads[5];
unsigned nr_reads;
/* The previous tuple scheduled (= the next tuple executed in the
* program) may require certain writes, in order to bypass the register
* file and use a temporary passthrough for the value. Up to 2 such
* constraints are architecturally satisfiable */
unsigned forced_count;
bi_index forceds[2];
};
struct bi_tuple_state {
/* Is this the last tuple in the clause */
bool last;
/* Scheduled ADD instruction, or null if none */
bi_instr *add;
/* Reads for previous (succeeding) tuple */
bi_index prev_reads[5];
unsigned nr_prev_reads;
bi_tuple *prev;
/* Register slot state for current tuple */
struct bi_reg_state reg;
/* Constants are shared in the tuple. If constant_count is nonzero, it
* gives the number of embedded constants. Otherwise, fau is the slot read
* from FAU, or zero if none is assigned. Ordinarily FAU slot 0 reads zero,
* but within a tuple, that should be encoded as constant_count != 0
* and constants[0] = constants[1] = 0 (see the example after this struct) */
unsigned constant_count;
union {
uint32_t constants[2];
enum bir_fau fau;
};
unsigned pcrel_idx;
};
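/* For instance (hypothetical tuple): a tuple whose instructions read only #0
* is built with constant_count = 1 and constants[0] = 0; bi_schedule_clause
* later demotes that back to the BIR_FAU_ZERO slot so no constant word needs
* to be emitted for it. */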
struct bi_const_state {
unsigned constant_count;
bool pcrel; /* applies to first const */
uint32_t constants[2];
/* Index of the constant into the clause */
unsigned word_idx;
};
enum bi_ftz_state {
/* No flush-to-zero state assigned yet */
BI_FTZ_STATE_NONE,
/* Never flush-to-zero */
BI_FTZ_STATE_DISABLE,
/* Always flush-to-zero */
BI_FTZ_STATE_ENABLE,
};
struct bi_clause_state {
/* Has a message-passing instruction already been assigned? */
bool message;
/* Indices already accessed, this needs to be tracked to avoid hazards
* around message-passing instructions */
unsigned access_count;
bi_index accesses[(BI_MAX_SRCS + BI_MAX_DESTS) * 16];
unsigned tuple_count;
struct bi_const_state consts[8];
/* Numerical state of the clause */
enum bi_ftz_state ftz;
};
/* Determines message type by checking the table and a few special cases. Only
* case missing is tilebuffer instructions that access depth/stencil, which
* require a Z_STENCIL message (to implement
* ARM_shader_framebuffer_fetch_depth_stencil) */
static enum bifrost_message_type
bi_message_type_for_instr(bi_instr *ins)
{
enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);
if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
return BIFROST_MESSAGE_Z_STENCIL;
if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
return BIFROST_MESSAGE_ATTRIBUTE;
return msg;
}
/* Attribute, texture, and UBO load (attribute message) instructions support
* bindless, so just check the message type */
ASSERTED static bool
bi_supports_dtsel(bi_instr *ins)
{
switch (bi_message_type_for_instr(ins)) {
case BIFROST_MESSAGE_ATTRIBUTE:
return ins->op != BI_OPCODE_LD_GCLK_U64;
case BIFROST_MESSAGE_TEX:
return true;
default:
return false;
}
}
/* Adds an edge to the dependency graph */
static void
bi_push_dependency(unsigned parent, unsigned child,
BITSET_WORD **dependents, unsigned *dep_counts)
{
if (!BITSET_TEST(dependents[parent], child)) {
BITSET_SET(dependents[parent], child);
dep_counts[child]++;
}
}
static void
add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
BITSET_WORD **dependents, unsigned *dep_counts)
{
assert(index < 64);
util_dynarray_foreach(table + index, unsigned, parent)
bi_push_dependency(*parent, child, dependents, dep_counts);
}
static void
mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
{
assert(index < 64);
util_dynarray_append(&table[index], unsigned, parent);
}
static bool
bi_is_sched_barrier(bi_instr *I)
{
switch (I->op) {
case BI_OPCODE_BARRIER:
case BI_OPCODE_DISCARD_F32:
return true;
default:
return false;
}
}
static void
bi_create_dependency_graph(struct bi_worklist st, bool inorder, bool is_blend)
{
struct util_dynarray last_read[64], last_write[64];
for (unsigned i = 0; i < 64; ++i) {
util_dynarray_init(&last_read[i], NULL);
util_dynarray_init(&last_write[i], NULL);
}
/* Initialize dependency graph */
for (unsigned i = 0; i < st.count; ++i) {
st.dependents[i] =
calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
st.dep_counts[i] = 0;
}
unsigned prev_msg = ~0;
/* Populate dependency graph */
for (signed i = st.count - 1; i >= 0; --i) {
bi_instr *ins = st.instructions[i];
bi_foreach_src(ins, s) {
if (ins->src[s].type != BI_INDEX_REGISTER) continue;
unsigned count = bi_count_read_registers(ins, s);
for (unsigned c = 0; c < count; ++c)
add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts);
}
/* Keep message-passing ops in order. (This pass only cares
* about bundling; reordering of message-passing instructions
* happens during earlier scheduling.) */
if (bi_message_type_for_instr(ins)) {
if (prev_msg != ~0)
bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);
prev_msg = i;
}
/* Handle schedule barriers, adding all the deps */
if (inorder || bi_is_sched_barrier(ins)) {
for (unsigned j = 0; j < st.count; ++j) {
if (i == j) continue;
bi_push_dependency(MAX2(i, j), MIN2(i, j),
st.dependents, st.dep_counts);
}
}
bi_foreach_dest(ins, d) {
if (ins->dest[d].type != BI_INDEX_REGISTER) continue;
unsigned dest = ins->dest[d].value;
unsigned count = bi_count_write_registers(ins, d);
for (unsigned c = 0; c < count; ++c) {
add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts);
add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts);
mark_access(last_write, dest + c, i);
}
}
/* Blend shaders are allowed to clobber R0-R15. Treat these
* registers like extra destinations for scheduling purposes.
*/
if (ins->op == BI_OPCODE_BLEND && !is_blend) {
for (unsigned c = 0; c < 16; ++c) {
add_dependency(last_read, c, i, st.dependents, st.dep_counts);
add_dependency(last_write, c, i, st.dependents, st.dep_counts);
mark_access(last_write, c, i);
}
}
bi_foreach_src(ins, s) {
if (ins->src[s].type != BI_INDEX_REGISTER) continue;
unsigned count = bi_count_read_registers(ins, s);
for (unsigned c = 0; c < count; ++c)
mark_access(last_read, ins->src[s].value + c, i);
}
}
/* If there is a branch, all instructions depend on it, as interblock
* execution must be purely in-order */
bi_instr *last = st.instructions[st.count - 1];
if (last->branch_target || last->op == BI_OPCODE_JUMP) {
for (signed i = st.count - 2; i >= 0; --i)
bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
}
/* Free the intermediate structures */
for (unsigned i = 0; i < 64; ++i) {
util_dynarray_fini(&last_read[i]);
util_dynarray_fini(&last_write[i]);
}
}
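/* Worked example (hypothetical instructions and registers): for a block
*
*    I0: message-passing op writing r0
*    I1: FADD.f32 r1, r0, r2
*    I2: message-passing op reading r1
*
* the backwards walk leaves dep_counts = { I0: 2, I1: 1, I2: 0 }: I1 is held
* back by I2 (which reads the r1 that I1 writes), and I0 is held back both by
* I1 (which reads the r0 that I0 writes) and by the message-ordering edge
* from I2. Only I2 starts on the worklist; retiring I2 releases I1, and
* retiring I1 releases I0, which is exactly the bottom-up order the clause
* scheduler consumes. */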
/* Scheduler pseudoinstruction lowerings to enable instruction pairings.
* Currently only support CUBEFACE -> *CUBEFACE1/+CUBEFACE2
*/
static bi_instr *
bi_lower_cubeface(bi_context *ctx,
struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
bi_instr *pinstr = tuple->add;
bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0],
pinstr->src[0], pinstr->src[1], pinstr->src[2]);
pinstr->op = BI_OPCODE_CUBEFACE2;
pinstr->dest[0] = pinstr->dest[1];
pinstr->dest[1] = bi_null();
pinstr->src[0] = cubeface1->dest[0];
pinstr->src[1] = bi_null();
pinstr->src[2] = bi_null();
return cubeface1;
}
/* Pseudo arguments are (rbase, address lo, address hi). We need *ATOM_C.i32 to
* have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
* arguments (rbase, address lo, address hi, rbase) */
static bi_instr *
bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause, struct
bi_tuple_state *tuple)
{
bi_instr *pinstr = tuple->add;
bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
bi_instr *atom_c = bi_atom_c_return_i32(&b,
pinstr->src[1], pinstr->src[2], pinstr->src[0],
pinstr->atom_opc);
if (bi_is_null(pinstr->dest[0]))
atom_c->op = BI_OPCODE_ATOM_C_I32;
pinstr->op = BI_OPCODE_ATOM_CX;
pinstr->src[3] = atom_c->src[2];
return atom_c;
}
static bi_instr *
bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause, struct
bi_tuple_state *tuple)
{
bi_instr *pinstr = tuple->add;
bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
bi_instr *atom_c = bi_atom_c1_return_i32(&b,
pinstr->src[0], pinstr->src[1], pinstr->atom_opc);
if (bi_is_null(pinstr->dest[0]))
atom_c->op = BI_OPCODE_ATOM_C1_I32;
pinstr->op = BI_OPCODE_ATOM_CX;
pinstr->src[2] = pinstr->src[1];
pinstr->src[1] = pinstr->src[0];
pinstr->src[3] = bi_dontcare(&b);
pinstr->src[0] = bi_null();
return atom_c;
}
static bi_instr *
bi_lower_seg_add(bi_context *ctx,
struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
bi_instr *pinstr = tuple->add;
bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
pinstr->preserve_null, pinstr->seg);
pinstr->op = BI_OPCODE_SEG_ADD;
pinstr->src[0] = pinstr->src[1];
pinstr->src[1] = bi_null();
assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
pinstr->dest[0].value += 1;
return fma;
}
static bi_instr *
bi_lower_dtsel(bi_context *ctx,
struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
bi_instr *add = tuple->add;
bi_builder b = bi_init_builder(ctx, bi_before_instr(add));
bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader),
add->src[0], add->table);
add->src[0] = dtsel->dest[0];
assert(bi_supports_dtsel(add));
return dtsel;
}
/* Flatten linked list to array for O(1) indexing */
static bi_instr **
bi_flatten_block(bi_block *block, unsigned *len)
{
if (list_is_empty(&block->instructions))
return NULL;
*len = list_length(&block->instructions);
bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));
unsigned i = 0;
bi_foreach_instr_in_block(block, ins)
instructions[i++] = ins;
return instructions;
}
/* The worklist tracks instructions without outstanding dependencies. For
* debug, in-order scheduling may be forced, in which case the dependency
* graph simply serializes the block.
*/
static struct bi_worklist
bi_initialize_worklist(bi_block *block, bool inorder, bool is_blend)
{
struct bi_worklist st = { };
st.instructions = bi_flatten_block(block, &st.count);
if (!st.count)
return st;
st.dependents = calloc(st.count, sizeof(st.dependents[0]));
st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));
bi_create_dependency_graph(st, inorder, is_blend);
st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
for (unsigned i = 0; i < st.count; ++i) {
if (st.dep_counts[i] == 0)
BITSET_SET(st.worklist, i);
}
return st;
}
static void
bi_free_worklist(struct bi_worklist st)
{
free(st.dep_counts);
free(st.dependents);
free(st.instructions);
free(st.worklist);
}
static void
bi_update_worklist(struct bi_worklist st, unsigned idx)
{
assert(st.dep_counts[idx] == 0);
if (!st.dependents[idx])
return;
/* Iterate each dependent to remove one dependency (`done`),
* adding dependents to the worklist where possible. */
unsigned i;
BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
assert(st.dep_counts[i] != 0);
unsigned new_deps = --st.dep_counts[i];
if (new_deps == 0)
BITSET_SET(st.worklist, i);
}
free(st.dependents[idx]);
}
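/* A minimal sketch of how the worklist is consumed (this mirrors the body of
* bi_take_instr below; lowering and debug paths elided):
*
*    unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);
*    if (idx < st.count) {
*       bi_instr *instr = st.instructions[idx];
*       BITSET_CLEAR(st.worklist, idx);
*       bi_update_worklist(st, idx);
*    }
*/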
/* Scheduler predicates */
/* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */
static bool
bi_can_iaddc(bi_instr *ins)
{
return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate &&
ins->src[0].swizzle == BI_SWIZZLE_H01 &&
ins->src[1].swizzle == BI_SWIZZLE_H01);
}
/*
* The encoding of *FADD.v2f16 only specifies a single abs flag. All abs
* encodings are permitted by swapping operands; however, this scheme fails if
* both operands are equal. Test for this case.
*/
static bool
bi_impacted_abs(bi_instr *I)
{
return I->src[0].abs && I->src[1].abs &&
bi_is_word_equiv(I->src[0], I->src[1]);
}
bool
bi_can_fma(bi_instr *ins)
{
/* +IADD.i32 -> *IADDC.i32 */
if (bi_can_iaddc(ins))
return true;
/* +MUX -> *CSEL */
if (bi_can_replace_with_csel(ins))
return true;
/* *FADD.v2f16 has restricted abs modifiers, use +FADD.v2f16 instead */
if (ins->op == BI_OPCODE_FADD_V2F16 && bi_impacted_abs(ins))
return false;
/* TODO: some additional fp16 constraints */
return bi_opcode_props[ins->op].fma;
}
static bool
bi_impacted_fadd_widens(bi_instr *I)
{
enum bi_swizzle swz0 = I->src[0].swizzle;
enum bi_swizzle swz1 = I->src[1].swizzle;
return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) ||
(swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) ||
(swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00);
}
bool
bi_can_add(bi_instr *ins)
{
/* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */
if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp)
return false;
/* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */
if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs))
return false;
/* +FADD.f32 has restricted widens, use *FADD.f32 for the full set */
if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins))
return false;
/* TODO: some additional fp16 constraints */
return bi_opcode_props[ins->op].add;
}
/* Architecturally, no single instruction has a "not last" constraint. However,
* pseudoinstructions writing multiple destinations (expanding to multiple
* paired instructions) can run afoul of the "no two writes in the last tuple"
* constraint, so we check for that here.
*
* Exception to the exception: TEXC, which writes to multiple sets of staging
* registers. Staging registers bypass the usual register write mechanism so
* this restriction does not apply.
*/
static bool
bi_must_not_last(bi_instr *ins)
{
return !bi_is_null(ins->dest[0]) && !bi_is_null(ins->dest[1]) &&
(ins->op != BI_OPCODE_TEXC);
}
/* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we
* treat it as a message-passing instruction for the purpose of scheduling
* despite it passing no logical message. Otherwise, invalid encoding faults may
* be raised for unknown reasons (possibly an erratum).
*/
bool
bi_must_message(bi_instr *ins)
{
return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) ||
(ins->op == BI_OPCODE_DISCARD_F32);
}
static bool
bi_fma_atomic(enum bi_opcode op)
{
switch (op) {
case BI_OPCODE_ATOM_C_I32:
case BI_OPCODE_ATOM_C_I64:
case BI_OPCODE_ATOM_C1_I32:
case BI_OPCODE_ATOM_C1_I64:
case BI_OPCODE_ATOM_C1_RETURN_I32:
case BI_OPCODE_ATOM_C1_RETURN_I64:
case BI_OPCODE_ATOM_C_RETURN_I32:
case BI_OPCODE_ATOM_C_RETURN_I64:
case BI_OPCODE_ATOM_POST_I32:
case BI_OPCODE_ATOM_POST_I64:
case BI_OPCODE_ATOM_PRE_I64:
return true;
default:
return false;
}
}
bool
bi_reads_zero(bi_instr *ins)
{
return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD);
}
bool
bi_reads_temps(bi_instr *ins, unsigned src)
{
switch (ins->op) {
/* Cannot permute a temporary */
case BI_OPCODE_CLPER_I32:
case BI_OPCODE_CLPER_OLD_I32:
return src != 0;
/* ATEST isn't supposed to be restricted, but in practice it always
* wants to source its coverage mask input (source 0) from register 60,
* which won't work properly if we put the input in a temp. This
* requires workarounds in both RA and clause scheduling.
*/
case BI_OPCODE_ATEST:
return src != 0;
case BI_OPCODE_IMULD:
return false;
default:
return true;
}
}
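/* Example of the ATEST case above (hypothetical schedule): if the scheduler
* considers placing, in the preceding tuple, the instruction that produces
* ATEST's coverage mask (source 0), returning false here lets
* bi_has_cross_passthrough_hazard() reject that pairing, so the mask travels
* through the register file (r60) rather than a temporary. */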
static bool
bi_impacted_t_modifiers(bi_instr *I, unsigned src)
{
enum bi_swizzle swizzle = I->src[src].swizzle;
switch (I->op) {
case BI_OPCODE_F16_TO_F32:
case BI_OPCODE_F16_TO_S32:
case BI_OPCODE_F16_TO_U32:
case BI_OPCODE_MKVEC_V2I16:
case BI_OPCODE_S16_TO_F32:
case BI_OPCODE_S16_TO_S32:
case BI_OPCODE_U16_TO_F32:
case BI_OPCODE_U16_TO_U32:
return (swizzle != BI_SWIZZLE_H00);
case BI_OPCODE_BRANCH_F32:
case BI_OPCODE_LOGB_F32:
case BI_OPCODE_ILOGB_F32:
case BI_OPCODE_FADD_F32:
case BI_OPCODE_FCMP_F32:
case BI_OPCODE_FREXPE_F32:
case BI_OPCODE_FREXPM_F32:
case BI_OPCODE_FROUND_F32:
return (swizzle != BI_SWIZZLE_H01);
case BI_OPCODE_IADD_S32:
case BI_OPCODE_IADD_U32:
case BI_OPCODE_ISUB_S32:
case BI_OPCODE_ISUB_U32:
case BI_OPCODE_IADD_V4S8:
case BI_OPCODE_IADD_V4U8:
case BI_OPCODE_ISUB_V4S8:
case BI_OPCODE_ISUB_V4U8:
return (src == 1) && (swizzle != BI_SWIZZLE_H01);
case BI_OPCODE_S8_TO_F32:
case BI_OPCODE_S8_TO_S32:
case BI_OPCODE_U8_TO_F32:
case BI_OPCODE_U8_TO_U32:
return (swizzle != BI_SWIZZLE_B0000);
case BI_OPCODE_V2S8_TO_V2F16:
case BI_OPCODE_V2S8_TO_V2S16:
case BI_OPCODE_V2U8_TO_V2F16:
case BI_OPCODE_V2U8_TO_V2U16:
return (swizzle != BI_SWIZZLE_B0022);
case BI_OPCODE_IADD_V2S16:
case BI_OPCODE_IADD_V2U16:
case BI_OPCODE_ISUB_V2S16:
case BI_OPCODE_ISUB_V2U16:
return (src == 1) && (swizzle >= BI_SWIZZLE_H11);
#if 0
/* Restriction on IADD in 64-bit clauses on G72 */
case BI_OPCODE_IADD_S64:
case BI_OPCODE_IADD_U64:
return (src == 1) && (swizzle != BI_SWIZZLE_D0);
#endif
default:
return false;
}
}
bool
bi_reads_t(bi_instr *ins, unsigned src)
{
/* Branch offset cannot come from passthrough */
if (bi_opcode_props[ins->op].branch)
return src != 2;
/* Table can never read passthrough */
if (bi_opcode_props[ins->op].table)
return false;
/* Staging register reads may happen before the succeeding register
* block encodes a write, so effectively there is no passthrough */
if (bi_is_staging_src(ins, src))
return false;
/* Bifrost cores newer than Mali G71 have restrictions on swizzles on
* same-cycle temporaries. Check the list for these hazards. */
if (bi_impacted_t_modifiers(ins, src))
return false;
/* Descriptor must not come from a passthrough */
switch (ins->op) {
case BI_OPCODE_LD_CVT:
case BI_OPCODE_LD_TILE:
case BI_OPCODE_ST_CVT:
case BI_OPCODE_ST_TILE:
case BI_OPCODE_TEXC:
return src != 2;
case BI_OPCODE_BLEND:
return src != 2 && src != 3;
/* +JUMP can't read the offset from T */
case BI_OPCODE_JUMP:
return false;
/* Else, just check if we can read any temps */
default:
return bi_reads_temps(ins, src);
}
}
/* Counts the number of 64-bit constants required by a clause. TODO: We
* might want to account for merging, right now we overestimate, but
* that's probably fine most of the time */
static unsigned
bi_nconstants(struct bi_clause_state *clause)
{
unsigned count_32 = 0;
for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
count_32 += clause->consts[i].constant_count;
return DIV_ROUND_UP(count_32, 2);
}
/* Would there be space for constants if we added one tuple? */
static bool
bi_space_for_more_constants(struct bi_clause_state *clause)
{
return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
}
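/* Worked example of the budget above (hypothetical clause state): with
* tuple_count == 3 and five 32-bit constants recorded so far, bi_nconstants()
* rounds up to three 64-bit words and the check is 3 < 13 - (3 + 1), i.e.
* 3 < 9, so another tuple carrying constants still fits. */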
/* Updates the FAU assignment for a tuple. A valid FAU assignment must be
* possible (as a precondition), though not necessarily on the selected unit;
* this is guaranteed per-instruction by bi_lower_fau and per-tuple by
* bi_instr_schedulable */
static bool
bi_update_fau(struct bi_clause_state *clause,
struct bi_tuple_state *tuple,
bi_instr *instr, bool fma, bool destructive)
{
/* Maintain our own constants, for nondestructive mode */
uint32_t copied_constants[2], copied_count;
unsigned *constant_count = &tuple->constant_count;
uint32_t *constants = tuple->constants;
enum bir_fau fau = tuple->fau;
if (!destructive) {
memcpy(copied_constants, tuple->constants,
(*constant_count) * sizeof(constants[0]));
copied_count = tuple->constant_count;
constant_count = &copied_count;
constants = copied_constants;
}
bi_foreach_src(instr, s) {
bi_index src = instr->src[s];
if (src.type == BI_INDEX_FAU) {
bool no_constants = *constant_count == 0;
bool no_other_fau = (fau == src.value) || !fau;
bool mergable = no_constants && no_other_fau;
if (destructive) {
assert(mergable);
tuple->fau = src.value;
} else if (!mergable) {
return false;
}
fau = src.value;
} else if (src.type == BI_INDEX_CONSTANT) {
/* No need to reserve space if we have a fast 0 */
if (src.value == 0 && fma && bi_reads_zero(instr))
continue;
/* If there is a branch target, #0 by convention is the
* PC-relative offset to the target */
bool pcrel = instr->branch_target && src.value == 0;
bool found = false;
for (unsigned i = 0; i < *constant_count; ++i) {
found |= (constants[i] == src.value) &&
(i != tuple->pcrel_idx);
}
/* pcrel constants are unique, so don't match */
if (found && !pcrel)
continue;
bool no_fau = (*constant_count > 0) || !fau;
bool mergable = no_fau && ((*constant_count) < 2);
if (destructive) {
assert(mergable);
if (pcrel)
tuple->pcrel_idx = *constant_count;
} else if (!mergable)
return false;
constants[(*constant_count)++] = src.value;
}
}
/* Constants per clause may be limited by tuple count */
bool room_for_constants = (*constant_count == 0) ||
bi_space_for_more_constants(clause);
if (destructive)
assert(room_for_constants);
else if (!room_for_constants)
return false;
return true;
}
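/* Worked example of the merge rules above (hypothetical tuple state): if the
* tuple already holds the constant 0x3f800000 and a candidate instruction
* reads FAU (say, a uniform), the two are not mergable since constants and
* FAU are mutually exclusive, so the nondestructive test fails and the
* instruction waits for another tuple. A candidate reading #0x3f800000 again,
* or reading #0 on the FMA unit (fast zero), merges for free. */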
/* Given an in-progress tuple, a candidate new instruction to add to the tuple,
* and a source (index) from that candidate, determine whether this source is
* "new", in the sense of requiring an additional read slot. That is, checks
* whether the specified source reads from the register file via a read slot
* (determined by its type and placement) and whether the source was already
* specified by a prior read slot (to avoid double counting) */
static bool
bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx)
{
bi_index src = instr->src[src_idx];
/* Only consider sources which come from the register file */
if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER))
return false;
/* Staging register reads bypass the usual register file mechanism */
if (bi_is_staging_src(instr, src_idx))
return false;
/* If a source is already read in the tuple, it is already counted */
for (unsigned t = 0; t < reg->nr_reads; ++t)
if (bi_is_word_equiv(src, reg->reads[t]))
return false;
/* If a source is read in _this instruction_, it is already counted */
for (unsigned t = 0; t < src_idx; ++t)
if (bi_is_word_equiv(src, instr->src[t]))
return false;
return true;
}
/* Given two tuples in source order, count the number of register reads of the
* successor, determined as the number of unique words accessed that aren't
* written by the predecessor (since those are tempable).
*/
static unsigned
bi_count_succ_reads(bi_index t0, bi_index t1,
bi_index *succ_reads, unsigned nr_succ_reads)
{
unsigned reads = 0;
for (unsigned i = 0; i < nr_succ_reads; ++i) {
bool unique = true;
for (unsigned j = 0; j < i; ++j)
if (bi_is_word_equiv(succ_reads[i], succ_reads[j]))
unique = false;
if (!unique)
continue;
if (bi_is_word_equiv(succ_reads[i], t0))
continue;
if (bi_is_word_equiv(succ_reads[i], t1))
continue;
reads++;
}
return reads;
}
/* Not all instructions can read from the staging passthrough (as determined by
* reads_t); check if a given pair of instructions has such a restriction. Note
* we also use this mechanism to prevent data races around staging register
* reads, so we allow the input source to potentially be vector-valued */
static bool
bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
{
bi_foreach_src(add, s) {
bi_index src = add->src[s];
if (src.type != BI_INDEX_REGISTER)
continue;
unsigned count = bi_count_read_registers(add, s);
bool read = false;
for (unsigned d = 0; d < count; ++d)
read |= bi_is_equiv(fma, bi_register(src.value + d));
if (read && !bi_reads_t(add, s))
return true;
}
return false;
}
/* Likewise for cross-tuple passthrough (reads_temps) */
static bool
bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
{
bi_foreach_instr_in_tuple(succ, pins) {
bi_foreach_src(pins, s) {
if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
!bi_reads_temps(pins, s))
return true;
}
}
return false;
}
/* Is a register written other than via the staging mechanism? ATEST is special,
* writing to both a staging register and a regular register (fixed packing).
* BLEND is special since it has to write r48 the normal way even if it never
* gets read. This depends on liveness analysis, as a register is not needed
* for a write that will be discarded after one tuple. */
static unsigned
bi_write_count(bi_instr *instr, uint64_t live_after_temp)
{
if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
return 1;
unsigned count = 0;
bi_foreach_dest(instr, d) {
if (d == 0 && bi_opcode_props[instr->op].sr_write)
continue;
if (bi_is_null(instr->dest[d]))
continue;
assert(instr->dest[d].type == BI_INDEX_REGISTER);
if (live_after_temp & BITFIELD64_BIT(instr->dest[d].value))
count++;
}
return count;
}
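/* Example of the liveness interaction above (hypothetical): an FMA whose
* destination is not live after the next tuple (absent from live_after_temp)
* counts as zero register file writes, since its result only rides the
* passthrough network for one tuple and is never committed to a register. */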
/*
* Test if an instruction requires flush-to-zero mode. Currently only supported
* for f16<-->f32 conversions to implement fquantize16
*/
static bool
bi_needs_ftz(bi_instr *I)
{
return (I->op == BI_OPCODE_F16_TO_F32 ||
I->op == BI_OPCODE_V2F32_TO_V2F16) && I->ftz;
}
/*
* Test if an instruction would be numerically incompatible with the clause. At
* present we only consider flush-to-zero modes.
*/
static bool
bi_numerically_incompatible(struct bi_clause_state *clause, bi_instr *instr)
{
return (clause->ftz != BI_FTZ_STATE_NONE) &&
((clause->ftz == BI_FTZ_STATE_ENABLE) != bi_needs_ftz(instr));
}
/* Instruction placement entails two questions: what subset of instructions in
* the block can legally be scheduled? and of those which is the best? That is,
* we seek to maximize a cost function on a subset of the worklist satisfying a
* particular predicate. The necessary predicate is determined entirely by
* Bifrost's architectural limitations and is described in the accompanying
* whitepaper. The cost function is a heuristic. */
static bool
bi_instr_schedulable(bi_instr *instr,
struct bi_clause_state *clause,
struct bi_tuple_state *tuple,
uint64_t live_after_temp,
bool fma)
{
/* The units must match */
if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
return false;
/* There can only be one message-passing instruction per clause */
if (bi_must_message(instr) && clause->message)
return false;
/* Some instructions have placement requirements */
if (bi_opcode_props[instr->op].last && !tuple->last)
return false;
if (bi_must_not_last(instr) && tuple->last)
return false;
/* Numerical properties must be compatible with the clause */
if (bi_numerically_incompatible(clause, instr))
return false;
/* Message-passing instructions are not guaranteed to write within the
* same clause (most likely they will not), so if a later instruction
* in the clause accesses the destination, the message-passing
* instruction can't be scheduled */
if (bi_opcode_props[instr->op].sr_write) {
bi_foreach_dest(instr, d) {
if (bi_is_null(instr->dest[d]))
continue;
unsigned nr = bi_count_write_registers(instr, d);
assert(instr->dest[d].type == BI_INDEX_REGISTER);
unsigned reg = instr->dest[d].value;
for (unsigned i = 0; i < clause->access_count; ++i) {
bi_index idx = clause->accesses[i];
for (unsigned d = 0; d < nr; ++d) {
if (bi_is_equiv(bi_register(reg + d), idx))
return false;
}
}
}
}
if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
unsigned nr = bi_count_read_registers(instr, 0);
assert(instr->src[0].type == BI_INDEX_REGISTER);
unsigned reg = instr->src[0].value;
for (unsigned i = 0; i < clause->access_count; ++i) {
bi_index idx = clause->accesses[i];
for (unsigned d = 0; d < nr; ++d) {
if (bi_is_equiv(bi_register(reg + d), idx))
return false;
}
}
}
/* If FAU is already assigned, we may not disrupt that. Do a
* non-disruptive test update */
if (!bi_update_fau(clause, tuple, instr, fma, false))
return false;
/* If this choice of FMA would force a staging passthrough, the ADD
* instruction must support such a passthrough */
if (tuple->add && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add))
return false;
/* If this choice of destination would force a cross-tuple passthrough, the next tuple must support that */
if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
return false;
/* Register file writes are limited */
unsigned total_writes = tuple->reg.nr_writes;
total_writes += bi_write_count(instr, live_after_temp);
/* Last tuple in a clause can only write a single value */
if (tuple->last && total_writes > 1)
return false;
/* Register file reads are limited, so count unique */
unsigned unique_new_srcs = 0;
bi_foreach_src(instr, s) {
if (bi_tuple_is_new_src(instr, &tuple->reg, s))
unique_new_srcs++;
}
unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs;
bool can_spill_to_moves = (!tuple->add);
can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2));
can_spill_to_moves &= (clause->tuple_count < 7);
/* However, we can get an extra 1 or 2 sources by inserting moves */
if (total_srcs > (can_spill_to_moves ? 4 : 3))
return false;
/* Count effective reads for the successor */
unsigned succ_reads = bi_count_succ_reads(instr->dest[0],
tuple->add ? tuple->add->dest[0] : bi_null(),
tuple->prev_reads, tuple->nr_prev_reads);
/* Successor must satisfy R+W <= 4, so we require W <= 4-R */
if ((signed) total_writes > (4 - (signed) succ_reads))
return false;
return true;
}
static signed
bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple)
{
signed cost = 0;
/* Instructions that can schedule to either FMA or to ADD should be
* deprioritized since they're easier to reschedule elsewhere */
if (bi_can_fma(instr) && bi_can_add(instr))
cost++;
/* Message-passing instructions impose constraints on the registers
* later in the clause, so schedule them as late within a clause as
* possible (<==> prioritize them since we're backwards <==> decrease
* cost) */
if (bi_must_message(instr))
cost--;
/* Last instructions are big constraints (XXX: no effect on shader-db) */
if (bi_opcode_props[instr->op].last)
cost -= 2;
return cost;
}
static unsigned
bi_choose_index(struct bi_worklist st,
struct bi_clause_state *clause,
struct bi_tuple_state *tuple,
uint64_t live_after_temp,
bool fma)
{
unsigned i, best_idx = ~0;
signed best_cost = INT_MAX;
BITSET_FOREACH_SET(i, st.worklist, st.count) {
bi_instr *instr = st.instructions[i];
if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
continue;
signed cost = bi_instr_cost(instr, tuple);
/* Tie break in favour of later instructions, under the
* assumption this promotes temporary usage (reducing pressure
* on the register file). This is a side effect of the prepass that
* schedules for register pressure. */
if (cost <= best_cost) {
best_idx = i;
best_cost = cost;
}
}
return best_idx;
}
static void
bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
bi_instr *instr, uint64_t live_after_temp, bool fma)
{
bi_update_fau(clause, tuple, instr, fma, true);
/* TODO: maybe opt a bit? or maybe doesn't matter */
assert(clause->access_count + BI_MAX_SRCS + BI_MAX_DESTS <= ARRAY_SIZE(clause->accesses));
memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src));
clause->access_count += BI_MAX_SRCS;
memcpy(clause->accesses + clause->access_count, instr->dest, sizeof(instr->dest));
clause->access_count += BI_MAX_DESTS;
tuple->reg.nr_writes += bi_write_count(instr, live_after_temp);
bi_foreach_src(instr, s) {
if (bi_tuple_is_new_src(instr, &tuple->reg, s))
tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
}
/* This could be optimized to allow pairing integer instructions with
* special flush-to-zero instructions, but punting on this until we have
* a workload that cares.
*/
clause->ftz = bi_needs_ftz(instr) ? BI_FTZ_STATE_ENABLE :
BI_FTZ_STATE_DISABLE;
}
/* Choose the best instruction and pop it off the worklist. Returns NULL if no
* instruction is available. This function is destructive. */
static bi_instr *
bi_take_instr(bi_context *ctx, struct bi_worklist st,
struct bi_clause_state *clause,
struct bi_tuple_state *tuple,
uint64_t live_after_temp,
bool fma)
{
if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
return bi_lower_cubeface(ctx, clause, tuple);
else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM_RETURN_I32)
return bi_lower_atom_c(ctx, clause, tuple);
else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM1_RETURN_I32)
return bi_lower_atom_c1(ctx, clause, tuple);
else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64)
return bi_lower_seg_add(ctx, clause, tuple);
else if (tuple->add && tuple->add->table)
return bi_lower_dtsel(ctx, clause, tuple);
/* TODO: Optimize these moves */
if (!fma && tuple->nr_prev_reads > 3) {
/* Only spill by one source for now */
assert(tuple->nr_prev_reads == 4);
/* Pick a source to spill */
bi_index src = tuple->prev_reads[0];
/* Schedule the spill */
bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev));
bi_instr *mov = bi_mov_i32_to(&b, src, src);
bi_pop_instr(clause, tuple, mov, live_after_temp, fma);
return mov;
}
#ifndef NDEBUG
/* Don't pair instructions if debugging */
if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add)
return NULL;
#endif
unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);
if (idx >= st.count)
return NULL;
/* Update state to reflect taking the instruction */
bi_instr *instr = st.instructions[idx];
BITSET_CLEAR(st.worklist, idx);
bi_update_worklist(st, idx);
bi_pop_instr(clause, tuple, instr, live_after_temp, fma);
/* Fixups */
if (instr->op == BI_OPCODE_IADD_U32 && fma) {
assert(bi_can_iaddc(instr));
instr->op = BI_OPCODE_IADDC_I32;
instr->src[2] = bi_zero();
} else if (fma && bi_can_replace_with_csel(instr)) {
bi_replace_mux_with_csel(instr, false);
}
return instr;
}
/* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting
* to a passthrough register. If except_sr is true, the staging sources are
* skipped, so staging register reads are not accidentally encoded as
* passthrough (which is impossible) */
static void
bi_use_passthrough(bi_instr *ins, bi_index old,
enum bifrost_packed_src new,
bool except_sr)
{
/* Optional for convenience */
if (!ins || bi_is_null(old))
return;
bi_foreach_src(ins, i) {
if ((i == 0 || i == 4) && except_sr)
continue;
if (bi_is_word_equiv(ins->src[i], old)) {
ins->src[i].type = BI_INDEX_PASS;
ins->src[i].value = new;
ins->src[i].reg = false;
ins->src[i].offset = 0;
}
}
}
/* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use
* intertuple passthroughs where necessary. Passthroughs are allowed as a
* post-condition of scheduling. Note we rewrite ADD first, FMA second --
* opposite the order of execution. This is deliberate -- if both FMA and ADD
* write to the same logical register, the next executed tuple will get the
* latter result. There's no interference issue under the assumption of correct
* register allocation. */
static void
bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ)
{
bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false;
if (prec.add) {
bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false);
bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read);
}
if (prec.fma) {
bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false);
bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read);
}
}
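/* Example of the rewrite (hypothetical registers): if the preceding tuple's
* FMA writes r4 and the succeeding ADD reads r4, that source becomes
* BIFROST_SRC_PASS_FMA. If both FMA and ADD of the preceding tuple wrote r4,
* rewriting ADD first means readers end up on BIFROST_SRC_PASS_ADD, matching
* the value the register file would have held. */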
static void
bi_rewrite_fau_to_pass(bi_tuple *tuple)
{
bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
if (ins->src[s].type != BI_INDEX_FAU) continue;
bi_index pass = bi_passthrough(ins->src[s].offset ?
BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);
ins->src[s] = bi_replace_index(ins->src[s], pass);
}
}
static void
bi_rewrite_zero(bi_instr *ins, bool fma)
{
bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO);
bi_foreach_src(ins, s) {
bi_index src = ins->src[s];
if (src.type == BI_INDEX_CONSTANT && src.value == 0)
ins->src[s] = bi_replace_index(src, zero);
}
}
/* Assumes #0 to {T, FAU} rewrite has already occurred */
static void
bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel)
{
bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
if (ins->src[s].type != BI_INDEX_CONSTANT) continue;
uint32_t cons = ins->src[s].value;
ASSERTED bool lo = (cons == (constant & 0xffffffff));
bool hi = (cons == (constant >> 32ull));
/* PC offsets always live in the upper half, set to zero by
* convention before pack time. (This is safe, since if you
* wanted to compare against zero, you would use a BRANCHZ
* instruction instead.) */
if (cons == 0 && ins->branch_target != NULL) {
assert(pcrel);
hi = true;
lo = false;
} else if (pcrel) {
hi = false;
}
assert(lo || hi);
ins->src[s] = bi_replace_index(ins->src[s],
bi_passthrough(hi ? BIFROST_SRC_FAU_HI :
BIFROST_SRC_FAU_LO));
}
}
/* Constructs a constant state given a tuple state. This has the
* postcondition that pcrel applies to the first constant by convention,
* and PC-relative constants will be #0 by convention here, so swap to
* match if needed */
static struct bi_const_state
bi_get_const_state(struct bi_tuple_state *tuple)
{
struct bi_const_state consts = {
.constant_count = tuple->constant_count,
.constants[0] = tuple->constants[0],
.constants[1] = tuple->constants[1],
.pcrel = tuple->add && tuple->add->branch_target,
};
/* pcrel applies to the first constant by convention, and
* PC-relative constants will be #0 by convention here, so swap
* to match if needed */
if (consts.pcrel && consts.constants[0]) {
assert(consts.constant_count == 2);
assert(consts.constants[1] == 0);
consts.constants[1] = consts.constants[0];
consts.constants[0] = 0;
}
return consts;
}
/* Merges constants in a clause, satisfying the following rules, assuming no
* more than one tuple has pcrel:
*
* 1. If a tuple has two constants, they must be packed together. If one is
* pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
* (PC << 32)]. Otherwise choose an arbitrary order.
*
* 2. If a tuple has one constant, it may be shared with an existing
* pair that already contains that constant, or it may be combined with another
* (distinct) tuple of a single constant.
*
* This guarantees a packing is possible. The next routine handles modification
* related swapping, to satisfy format 12 and the lack of modification for
* tuple count 5/8 in EC0.
*/
static uint64_t
bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
{
/* At this point in the constant merge algorithm, pcrel constants are
* treated as zero, so pcrel implies at least one constant is zero */
assert(!pcrel || (c0 == 0 || c1 == 0));
/* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
uint32_t lo = (c0 == hi) ? c1 : c0;
/* Merge in the selected order */
return lo | (((uint64_t) hi) << 32ull);
}
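/* Worked examples (hypothetical constants): bi_merge_u32(0x10, 0xfff, false)
* yields 0x00000fff00000010, placing the larger value in the high half; with
* pcrel, bi_merge_u32(0, 0xabcd, true) reserves the high half as zero for the
* PC-relative slot, yielding 0x000000000000abcd. */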
static unsigned
bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count,
uint64_t *merged, unsigned *pcrel_pair)
{
unsigned merge_count = 0;
for (unsigned t = 0; t < tuple_count; ++t) {
if (consts[t].constant_count != 2) continue;
unsigned idx = ~0;
uint64_t val = bi_merge_u32(consts[t].constants[0],
consts[t].constants[1], consts[t].pcrel);
/* Skip the pcrel pair if assigned, because if one is assigned,
* this one is not pcrel by uniqueness so it's a mismatch */
for (unsigned s = 0; s < merge_count; ++s) {
if (merged[s] == val && (*pcrel_pair) != s) {
idx = s;
break;
}
}
if (idx == ~0) {
idx = merge_count++;
merged[idx] = val;
if (consts[t].pcrel)
(*pcrel_pair) = idx;
}
consts[t].word_idx = idx;
}
return merge_count;
}
static unsigned
bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count,
uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair)
{
bool pending = false, pending_pcrel = false;
uint32_t pending_single = 0;
for (unsigned t = 0; t < tuple_count; ++t) {
if (consts[t].constant_count != 1) continue;
uint32_t val = consts[t].constants[0];
unsigned idx = ~0;
/* Try to match, but don't match pcrel with non-pcrel, even
* though we can merge a pcrel with a non-pcrel single */
for (unsigned i = 0; i < pair_count; ++i) {
bool lo = ((pairs[i] & 0xffffffff) == val);
bool hi = ((pairs[i] >> 32) == val);
bool match = (lo || hi);
match &= ((*pcrel_pair) != i);
if (match && !consts[t].pcrel) {
idx = i;
break;
}
}
if (idx == ~0) {
idx = pair_count;
if (pending && pending_single != val) {
assert(!(pending_pcrel && consts[t].pcrel));
bool pcrel = pending_pcrel || consts[t].pcrel;
if (pcrel)
*pcrel_pair = idx;
pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel);
pending = pending_pcrel = false;
} else {
pending = true;
pending_pcrel = consts[t].pcrel;
pending_single = val;
}
}
consts[t].word_idx = idx;
}
/* Shift so it works whether pending_pcrel is set or not */
if (pending) {
if (pending_pcrel)
*pcrel_pair = pair_count;
pairs[pair_count++] = ((uint64_t) pending_single) << 32ull;
}
return pair_count;
}
static unsigned
bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx)
{
unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx);
return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx);
}
/* Swap two constants at word i and i+1 by swapping their actual positions and
* swapping all references so the meaning of the clause is preserved */
static void
bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i)
{
uint64_t tmp_pair = pairs[i + 0];
pairs[i + 0] = pairs[i + 1];
pairs[i + 1] = tmp_pair;
for (unsigned t = 0; t < 8; ++t) {
if (consts[t].word_idx == i)
consts[t].word_idx = (i + 1);
else if (consts[t].word_idx == (i + 1))
consts[t].word_idx = i;
}
}
/* Given merged constants, one of which might be PC-relative, fix up the M
* values so the PC-relative constant (if it exists) has the M1=4 modification
* and other constants are used as-is (which might require swapping) */
static unsigned
bi_apply_constant_modifiers(struct bi_const_state *consts,
uint64_t *pairs, unsigned *pcrel_idx,
unsigned tuple_count, unsigned constant_count)
{
unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;
/* Clauses with these tuple counts lack an M field for the packed EC0,
* so EC0 cannot be PC-relative, which might require swapping (and
* possibly adding an unused constant) to fit */
if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
constant_count = MAX2(constant_count, 2);
*pcrel_idx = 1;
bi_swap_constants(consts, pairs, 0);
}
/* EC0 might be packed free, after that constants are packed in pairs
* (with clause format 12), with M1 values computed from the pair */
for (unsigned i = start; i < constant_count; i += 2) {
bool swap = false;
bool last = (i + 1) == constant_count;
unsigned A1 = (pairs[i] >> 60);
unsigned B1 = (pairs[i + 1] >> 60);
if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
/* PC-relative constant must be E0, not E1 */
swap = (*pcrel_idx == (i + 1));
/* Set M1 = 4 by noting (A - B) mod 16 = 4 is
* equivalent to A = (B + 4) mod 16 and that we can
* control A */
unsigned B = swap ? A1 : B1;
unsigned A = (B + 4) & 0xF;
pairs[*pcrel_idx] |= ((uint64_t) A) << 60;
/* Swapped if swap set, identity if swap not set */
*pcrel_idx = i;
} else {
/* Compute M1 value if we don't swap */
unsigned M1 = (16 + A1 - B1) & 0xF;
/* For M1 = 0 or M1 >= 8, the constants are unchanged;
* otherwise 0 < (A1 - B1) % 16 < 8, which implies (B1 -
* A1) % 16 >= 8, so swapping will let them be used
* unchanged */
swap = (M1 != 0) && (M1 < 8);
/* However, we can't swap the last constant, so we
* force M1 = 0 instead for this case */
if (last && swap) {
pairs[i + 1] |= pairs[i] & (0xfull << 60);
swap = false;
}
}
if (swap) {
assert(!last);
bi_swap_constants(consts, pairs, i);
}
}
return constant_count;
}
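/* Worked example of the modifier math above (hypothetical top nibbles): if
* A1 = 0x9 and B1 = 0x2, then M1 = (16 + 9 - 2) & 0xF = 7, which lies in
* 0 < M1 < 8, so the pair is swapped; recomputed for the swapped order, M1
* becomes 9 (>= 8) and both constants are usable unchanged. If instead
* A1 = 0x2 and B1 = 0x9, M1 = 9 and no swap is needed. For a PC-relative E0,
* choosing A = (B + 4) & 0xF makes (A - B) mod 16 = 4, selecting the
* [sx64(E0) + (PC << 32)] modification. */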
/* Schedule a single clause. If no instructions remain, return NULL. */
static bi_clause *
bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
{
struct bi_clause_state clause_state = { 0 };
bi_clause *clause = rzalloc(ctx, bi_clause);
bi_tuple *tuple = NULL;
const unsigned max_tuples = ARRAY_SIZE(clause->tuples);
/* TODO: Decide flow control better */
clause->flow_control = BIFROST_FLOW_NBTB;
/* The last tuple in a clause can only write one register, so initialize that */
struct bi_reg_state reg_state = {};
bi_index prev_reads[5] = { bi_null() };
unsigned nr_prev_reads = 0;
/* We need to track future liveness. The main *live set tracks what is
* live at the current point in the program we are scheduling, but to
* determine temp eligibility, we instead want what will be live after
* the next tuple in the program. If you scheduled forwards, you'd need
* a crystal ball for this. Luckily we schedule backwards, so we just
* delay updates to the live_after_temp by an extra tuple. */
uint64_t live_after_temp = *live;
uint64_t live_next_tuple = live_after_temp;
do {
struct bi_tuple_state tuple_state = {
.last = (clause->tuple_count == 0),
.reg = reg_state,
.nr_prev_reads = nr_prev_reads,
.prev = tuple,
.pcrel_idx = ~0,
};
assert(nr_prev_reads < ARRAY_SIZE(prev_reads));
memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads));
unsigned idx = max_tuples - clause->tuple_count - 1;
tuple = &clause->tuples[idx];
if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
unsigned nr = bi_count_read_registers(clause->message, 0);
live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
}
/* Since we schedule backwards, we schedule ADD first */
tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
tuple->add = tuple_state.add;
/* Update liveness from the new instructions */
if (tuple->add)
*live = bi_postra_liveness_ins(*live, tuple->add);
if (tuple->fma)
*live = bi_postra_liveness_ins(*live, tuple->fma);
/* Rotate in the new per-tuple liveness */
live_after_temp = live_next_tuple;
live_next_tuple = *live;
/* We may have a message, but only one per clause */
if (tuple->add && bi_must_message(tuple->add)) {
assert(!clause_state.message);
clause_state.message = true;
clause->message_type =
bi_message_type_for_instr(tuple->add);
clause->message = tuple->add;
/* We don't need to set dependencies for blend shaders
* because the BLEND instruction in the fragment
* shader should have already done the wait */
if (!ctx->inputs->is_blend) {
switch (tuple->add->op) {
case BI_OPCODE_ATEST:
clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
break;
case BI_OPCODE_LD_TILE:
case BI_OPCODE_ST_TILE:
clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
break;
case BI_OPCODE_BLEND:
clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
break;
default:
break;
}
}
}
clause_state.consts[idx] = bi_get_const_state(&tuple_state);
/* Before merging constants, eliminate zeroes, otherwise the
* merging will fight over the #0 that never gets read (and is
* never marked as read by update_fau) */
if (tuple->fma && bi_reads_zero(tuple->fma))
bi_rewrite_zero(tuple->fma, true);
/* Rewrite away FAU, constant write is deferred */
if (!tuple_state.constant_count) {
tuple->fau_idx = tuple_state.fau;
bi_rewrite_fau_to_pass(tuple);
}
/* Use passthrough register for cross-stage accesses. Since
* there are just FMA and ADD stages, that means we rewrite to
* passthrough the sources of the ADD that read from the
* destination of the FMA */
if (tuple->fma) {
bi_use_passthrough(tuple->add, tuple->fma->dest[0],
BIFROST_SRC_STAGE, false);
}
/* Don't add an empty tuple, unless the worklist has nothing
* but a (pseudo)instruction failing to schedule due to a "not
* last instruction" constraint */
int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count));
bool not_last = (some_instruction > 0) &&
bi_must_not_last(st.instructions[some_instruction - 1]);
bool insert_empty = tuple_state.last && not_last;
if (!(tuple->fma || tuple->add || insert_empty))
break;
clause->tuple_count++;
/* Adding enough tuples might overflow constants */
if (!bi_space_for_more_constants(&clause_state))
break;
#ifndef NDEBUG
/* Don't schedule more than 1 tuple if debugging */
if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty)
break;
#endif
/* Link through the register state */
STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads));
memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads));
nr_prev_reads = tuple_state.reg.nr_reads;
clause_state.tuple_count++;
} while(clause->tuple_count < 8);
/* Don't schedule an empty clause */
if (!clause->tuple_count)
return NULL;
/* Before merging, rewrite away any tuples that read only zero */
for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
bi_tuple *tuple = &clause->tuples[i];
struct bi_const_state *st = &clause_state.consts[i];
if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel)
continue;
bi_foreach_instr_in_tuple(tuple, ins)
bi_rewrite_zero(ins, false);
/* Constant has been demoted to FAU, so don't pack it separately */
st->constant_count = 0;
/* Default */
assert(tuple->fau_idx == BIR_FAU_ZERO);
}
uint64_t constant_pairs[8] = { 0 };
unsigned pcrel_idx = ~0;
unsigned constant_words =
bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx);
constant_words = bi_apply_constant_modifiers(clause_state.consts,
constant_pairs, &pcrel_idx, clause->tuple_count,
constant_words);
clause->pcrel_idx = pcrel_idx;
for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
bi_tuple *tuple = &clause->tuples[i];
/* If no constants, leave FAU as it is, possibly defaulting to 0 */
if (clause_state.consts[i].constant_count == 0)
continue;
/* FAU is already handled */
assert(!tuple->fau_idx);
unsigned word_idx = clause_state.consts[i].word_idx;
assert(word_idx <= 8);
/* We could try to merge regardless of bottom bits as well, but
* that's probably diminishing returns */
uint64_t pair = constant_pairs[word_idx];
unsigned lo = pair & 0xF;
tuple->fau_idx = bi_constant_field(word_idx) | lo;
bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx);
}
clause->constant_count = constant_words;
memcpy(clause->constants, constant_pairs, sizeof(constant_pairs));
/* Branches must be last, so this can be factored out */
bi_instr *last = clause->tuples[max_tuples - 1].add;
clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
clause->block = block;
clause->ftz = (clause_state.ftz == BI_FTZ_STATE_ENABLE);
/* We emitted in reverse, to the back of the tuples array, so move the
* tuples up front for easy indexing */
memmove(clause->tuples,
clause->tuples + (max_tuples - clause->tuple_count),
clause->tuple_count * sizeof(clause->tuples[0]));
/* Use passthrough register for cross-tuple accesses. Note this is
* after the memmove, so this is forwards. Skip the first tuple since
* there is nothing before it to passthrough */
for (unsigned t = 1; t < clause->tuple_count; ++t)
bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]);
return clause;
}
static void
bi_schedule_block(bi_context *ctx, bi_block *block)
{
list_inithead(&block->clauses);
/* Copy list to dynamic array */
struct bi_worklist st = bi_initialize_worklist(block,
bifrost_debug & BIFROST_DBG_INORDER,
ctx->inputs->is_blend);
if (!st.count) {
bi_free_worklist(st);
return;
}
/* We need to track liveness during scheduling in order to determine whether
* we can use temporary (passthrough) registers */
uint64_t live = block->reg_live_out;
/* Schedule as many clauses as needed to fill the block */
bi_clause *u = NULL;
while((u = bi_schedule_clause(ctx, block, st, &live)))
list_add(&u->link, &block->clauses);
/* Back-to-back bit affects only the last clause of a block,
* the rest are implicitly true */
if (!list_is_empty(&block->clauses)) {
bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link);
if (bi_reconverge_branches(block))
last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
}
/* Reorder instructions to match the new schedule. First remove
* existing instructions and then recreate the list */
bi_foreach_instr_in_block_safe(block, ins) {
list_del(&ins->link);
}
bi_foreach_clause_in_block(block, clause) {
for (unsigned i = 0; i < clause->tuple_count; ++i) {
bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
list_addtail(&ins->link, &block->instructions);
}
}
}
block->scheduled = true;
#ifndef NDEBUG
unsigned i;
bool incomplete = false;
BITSET_FOREACH_SET(i, st.worklist, st.count) {
bi_print_instr(st.instructions[i], stderr);
incomplete = true;
}
if (incomplete)
unreachable("The above instructions failed to schedule.");
#endif
bi_free_worklist(st);
}
static bool
bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, unsigned *cwords, bi_index *fau)
{
bi_index src = ins->src[s];
/* Staging registers can't have FAU accesses */
if (bi_is_staging_src(ins, s))
return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);
if (src.type == BI_INDEX_CONSTANT) {
/* Allow fast zero */
if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
return true;
if (!bi_is_null(*fau))
return false;
/* Else, try to inline a constant */
for (unsigned i = 0; i < *cwords; ++i) {
if (src.value == constants[i])
return true;
}
if (*cwords >= 2)
return false;
constants[(*cwords)++] = src.value;
} else if (src.type == BI_INDEX_FAU) {
if (*cwords != 0)
return false;
/* Can only read from one pair of FAU words */
if (!bi_is_null(*fau) && (src.value != fau->value))
return false;
/* If there is a target, we'll need a PC-relative constant */
if (ins->branch_target)
return false;
*fau = src;
}
return true;
}
void
bi_lower_fau(bi_context *ctx)
{
bi_foreach_instr_global_safe(ctx, ins) {
bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
uint32_t constants[2];
unsigned cwords = 0;
bi_index fau = bi_null();
/* ATEST must have the ATEST datum encoded, not any other
* uniform. See to it this is the case. */
if (ins->op == BI_OPCODE_ATEST)
fau = ins->src[2];
bi_foreach_src(ins, s) {
if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) continue;
bi_index copy = bi_mov_i32(&b, ins->src[s]);
ins->src[s] = bi_replace_index(ins->src[s], copy);
}
}
}
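/* Example of bi_lower_fau above (hypothetical sources): an FMA.f32 reading
* { a uniform (FAU), #1.0, #2.0 } keeps only the FAU access; once the FAU
* slot is claimed, both constants fail bi_check_fau_src and are each copied
* to a temporary with MOV.i32 ahead of the instruction. */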
/* Only v7 allows specifying a dependency on the tilebuffer for the first
* clause of a shader. v6 requires adding a NOP clause with the dependency. */
static void
bi_add_nop_for_atest(bi_context *ctx)
{
/* Only needed on v6 */
if (ctx->arch >= 7)
return;
if (list_is_empty(&ctx->blocks))
return;
/* Fetch the first clause of the shader */
bi_block *block = list_first_entry(&ctx->blocks, bi_block, link);
bi_clause *clause = bi_next_clause(ctx, block, NULL);
if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) |
(1 << BIFROST_SLOT_ELDEST_COLOUR))))
return;
/* Add a NOP so we can wait for the dependencies required by the first
* clause */
bi_instr *I = rzalloc(ctx, bi_instr);
I->op = BI_OPCODE_NOP;
I->dest[0] = bi_null();
bi_clause *new_clause = ralloc(ctx, bi_clause);
*new_clause = (bi_clause) {
.flow_control = BIFROST_FLOW_NBTB,
.next_clause_prefetch = true,
.block = clause->block,
.tuple_count = 1,
.tuples[0] = { .fma = I, },
};
list_add(&new_clause->link, &clause->block->clauses);
}
void
bi_schedule(bi_context *ctx)
{
/* Fed into both scheduling and DCE */
bi_postra_liveness(ctx);
bi_foreach_block(ctx, block) {
bi_schedule_block(ctx, block);
}
bi_opt_dce_post_ra(ctx);
bi_add_nop_for_atest(ctx);
}