ir3: optimize subgroup operations using brcst.active
Follow the blob and optimize subgroup operations using brcst.active and getlast when supported.

The transformation consists of two parts. First, a NIR transform replaces subgroup operations with a sequence of new brcst_active_ir3 intrinsics followed by a new [type]_clusters_ir3 intrinsic (where type can be reduce, inclusive_scan, or exclusive_scan). The brcst_active_ir3 intrinsic is lowered directly to a brcst.active instruction. The other intrinsics get lowered to a new macro (OPC_SCAN_CLUSTERS_MACRO) which is later emitted as a loop (using getlast/getone) that iterates over all clusters and produces the requested scan result.

OPC_SCAN_CLUSTERS_MACRO has a number of optional arguments. First, since the exclusive scan result is not a natural by-product of the loop but has to be calculated explicitly, its destination is optional. This is necessary because adding it unconditionally would produce unused instructions that won't be DCE'd anymore at this point. Second, when performing 32b MUL_U reductions (which expand to multiple instructions), an extra scratch register is necessary.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6387
Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26950>
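For orientation before the diff, here is a minimal CPU-side sketch of the per-cluster half of this scheme (the part built by the NIR transform). It assumes a 16-fiber subgroup with every fiber active, an iadd reduction, and that brcst.active.wN copies the running value of the last lane of the lower half of each N-wide cluster into the lanes of the upper half while lower-half lanes receive the default (identity) value; those semantics, the lane count, and every helper name are illustrative assumptions, not taken from the driver.

#include <stdio.h>

#define LANES        16  /* assumed subgroup size for the example */
#define CLUSTER_SIZE 8   /* cluster width targeted by brcst.active.w2/.w4/.w8 */

/* Rough model of brcst.active.wN with every fiber active: lanes in the upper
 * half of each N-wide cluster read the running value of the last lane of the
 * lower half, while lanes in the lower half keep the default (identity). */
static void
brcst_active(unsigned n, const unsigned *val, unsigned ident, unsigned *out)
{
   for (unsigned lane = 0; lane < LANES; lane++) {
      unsigned base = lane & ~(n - 1);      /* start of this lane's n-wide cluster */
      if (lane - base >= n / 2)
         out[lane] = val[base + n / 2 - 1]; /* broadcast from the lower half */
      else
         out[lane] = ident;                 /* lower half keeps the identity */
   }
}

int
main(void)
{
   unsigned src[LANES], inclusive[LANES], exclusive[LANES], brcst[LANES];
   const unsigned ident = 0; /* identity for an iadd reduction */

   for (unsigned lane = 0; lane < LANES; lane++) {
      src[lane] = lane + 1;
      inclusive[lane] = src[lane];
      exclusive[lane] = ident;
   }

   /* The cluster_size = 2, 4, 8 sequence built by the NIR transform, with
    * OP = iadd: afterwards inclusive/exclusive hold per-cluster scans. */
   for (unsigned n = 2; n <= CLUSTER_SIZE; n *= 2) {
      brcst_active(n, inclusive, ident, brcst);
      for (unsigned lane = 0; lane < LANES; lane++) {
         exclusive[lane] += brcst[lane]; /* only needed for exclusive scans */
         inclusive[lane] += brcst[lane];
      }
   }

   for (unsigned lane = 0; lane < LANES; lane++)
      printf("lane %2u: inclusive %3u exclusive %3u\n",
             lane, inclusive[lane], exclusive[lane]);
   return 0;
}

Running this prints an add-scan that restarts every 8 lanes, which is the per-cluster state that the new *_clusters_ir3 intrinsics then combine across clusters.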
@@ -1357,6 +1357,15 @@ intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE])
# Should be used in the shader preamble.
intrinsic("copy_push_const_to_uniform_ir3", [1], indices=[BASE, RANGE])

intrinsic("brcst_active_ir3", dest_comp=1, src_comp=[1, 1], bit_sizes=src0,
          indices=[CLUSTER_SIZE])
intrinsic("reduce_clusters_ir3", dest_comp=1, src_comp=[1], bit_sizes=src0,
          indices=[REDUCTION_OP])
intrinsic("inclusive_scan_clusters_ir3", dest_comp=1, src_comp=[1],
          bit_sizes=src0, indices=[REDUCTION_OP])
intrinsic("exclusive_scan_clusters_ir3", dest_comp=1, src_comp=[1, 1],
          bit_sizes=src0, indices=[REDUCTION_OP])

# Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined
# within a blend shader to read/write the raw value from the tile buffer,
# without applying any format conversion in the process. If the shader needs
@@ -193,6 +193,7 @@ static const struct opc_info {
   OPC(1, OPC_READ_FIRST_MACRO, read_first.macro),
   OPC(1, OPC_SWZ_SHARED_MACRO, swz_shared.macro),
   OPC(1, OPC_SCAN_MACRO, scan.macro),
   OPC(1, OPC_SCAN_CLUSTERS_MACRO, scan_clusters.macro),
   OPC(1, OPC_SHPS_MACRO, shps.macro),
   OPC(1, OPC_PUSH_CONSTS_LOAD_MACRO, push_consts_load.macro),
@@ -130,6 +130,7 @@ typedef enum {

   /* Macros that expand to a loop */
   OPC_SCAN_MACRO = _OPC(1, 58),
   OPC_SCAN_CLUSTERS_MACRO = _OPC(1, 60),

   /* Macros that expand to an stsc at the start of the preamble.
    * It loads into const file and should not be optimized in any way.
@@ -619,11 +619,12 @@ struct ir3_array {
struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);

enum ir3_branch_type {
   IR3_BRANCH_COND,    /* condition */
   IR3_BRANCH_ANY,     /* subgroupAny(condition) */
   IR3_BRANCH_ALL,     /* subgroupAll(condition) */
   IR3_BRANCH_GETONE,  /* subgroupElect() */
   IR3_BRANCH_GETLAST, /* getlast.w8 */
   IR3_BRANCH_SHPS,    /* preamble start */
};

struct ir3_block {
@@ -2328,6 +2329,7 @@ INSTR1NODST(PREDT)
INSTR0(PREDF)
INSTR0(PREDE)
INSTR0(GETONE)
INSTR0(GETLAST)
INSTR0(SHPS)
INSTR0(SHPE)
@@ -2481,6 +2483,26 @@ ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
   return sam;
}

/* brcst.active rx, ry behaves like a conditional move: rx either keeps its
 * value or is set to ry. In order to model this in SSA form, we add an extra
 * argument (the initial value of rx) and tie it to the destination.
 */
static inline struct ir3_instruction *
ir3_BRCST_ACTIVE(struct ir3_block *block, unsigned cluster_size,
                 struct ir3_instruction *src,
                 struct ir3_instruction *dst_default)
{
   struct ir3_instruction *brcst =
      ir3_instr_create(block, OPC_BRCST_ACTIVE, 1, 2);
   brcst->cat5.cluster_size = cluster_size;
   brcst->cat5.type = TYPE_U32;
   struct ir3_register *brcst_dst = __ssa_dst(brcst);
   __ssa_src(brcst, src, 0);
   struct ir3_register *default_src = __ssa_src(brcst, dst_default, 0);
   ir3_reg_tie(brcst_dst, default_src);
   return brcst;
}

/* cat6 instructions: */
INSTR0(GETFIBERID)
INSTR2(LDLV)
@@ -1984,6 +1984,111 @@ emit_intrinsic_reduce(struct ir3_context *ctx, nir_intrinsic_instr *intr)
   return create_multidst_mov(ctx->block, dst);
}

static struct ir3_instruction *
emit_intrinsic_reduce_clusters(struct ir3_context *ctx,
                               nir_intrinsic_instr *intr)
{
   nir_op nir_reduce_op = (nir_op)nir_intrinsic_reduction_op(intr);
   reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
   unsigned dst_size = intr->def.bit_size;

   bool need_exclusive =
      intr->intrinsic == nir_intrinsic_exclusive_scan_clusters_ir3;
   bool need_scratch = reduce_op == REDUCE_OP_MUL_U && dst_size == 32;

   /* Note: the shared reg is initialized to the identity, so we need it to
    * always be 32-bit even when the source isn't because half shared regs are
    * not supported.
    */
   struct ir3_instruction *identity =
      create_immed(ctx->block, get_reduce_identity(nir_reduce_op, dst_size));
   identity->dsts[0]->flags |= IR3_REG_SHARED;

   /* OPC_SCAN_CLUSTERS_MACRO has the following destinations:
    * - Shared reg reduction result, must be initialized to the identity
    * - Inclusive scan result
    * - (iff exclusive) Exclusive scan result. Conditionally added because
    *   calculating the exclusive value is optional (i.e., not a side-effect of
    *   calculating the inclusive value) and won't be DCE'd anymore at this
    *   point.
    * - (iff 32b mul_u) Scratch register. We try to emit "op rx, ry, rx" for
    *   most ops but this isn't possible for the 32b mul_u macro since its
    *   destination is clobbered. So conditionally allocate an extra
    *   register in that case.
    *
    * Note that the getlast loop this macro expands to iterates over all
    * clusters. However, for each iteration, not only the fibers in the current
    * cluster are active but all later ones as well. Since they still need their
    * sources when their cluster is handled, all destinations interfere with
    * the sources.
    */
   unsigned ndst = 2 + need_exclusive + need_scratch;
   unsigned nsrc = 2 + need_exclusive;
   struct ir3_instruction *scan =
      ir3_instr_create(ctx->block, OPC_SCAN_CLUSTERS_MACRO, ndst, nsrc);
   scan->cat1.reduce_op = reduce_op;

   unsigned dst_flags = IR3_REG_EARLY_CLOBBER;
   if (ir3_bitsize(ctx, dst_size) == 16)
      dst_flags |= IR3_REG_HALF;

   struct ir3_register *reduce = __ssa_dst(scan);
   reduce->flags |= IR3_REG_SHARED;
   struct ir3_register *inclusive = __ssa_dst(scan);
   inclusive->flags |= dst_flags;

   struct ir3_register *exclusive = NULL;
   if (need_exclusive) {
      exclusive = __ssa_dst(scan);
      exclusive->flags |= dst_flags;
   }

   if (need_scratch) {
      struct ir3_register *scratch = __ssa_dst(scan);
      scratch->flags |= dst_flags;
   }

   struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
   ir3_reg_tie(reduce, reduce_init);

   struct ir3_instruction *inclusive_src = ir3_get_src(ctx, &intr->src[0])[0];
   __ssa_src(scan, inclusive_src, 0);

   if (need_exclusive) {
      struct ir3_instruction *exclusive_src =
         ir3_get_src(ctx, &intr->src[1])[0];
      __ssa_src(scan, exclusive_src, 0);
   }

   struct ir3_register *dst;
   switch (intr->intrinsic) {
   case nir_intrinsic_reduce_clusters_ir3:
      dst = reduce;
      break;
   case nir_intrinsic_inclusive_scan_clusters_ir3:
      dst = inclusive;
      break;
   case nir_intrinsic_exclusive_scan_clusters_ir3: {
      assert(exclusive != NULL);
      dst = exclusive;
      break;
   }
   default:
      unreachable("unknown reduce intrinsic");
   }

   return create_multidst_mov(ctx->block, dst);
}

static struct ir3_instruction *
emit_intrinsic_brcst_active(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_instruction *default_src = ir3_get_src(ctx, &intr->src[0])[0];
   struct ir3_instruction *brcst_val = ir3_get_src(ctx, &intr->src[1])[0];
   return ir3_BRCST_ACTIVE(ctx->block, nir_intrinsic_cluster_size(intr),
                           brcst_val, default_src);
}

static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
@@ -2637,6 +2742,16 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
      dst[0] = emit_intrinsic_reduce(ctx, intr);
      break;

   case nir_intrinsic_reduce_clusters_ir3:
   case nir_intrinsic_inclusive_scan_clusters_ir3:
   case nir_intrinsic_exclusive_scan_clusters_ir3:
      dst[0] = emit_intrinsic_reduce_clusters(ctx, intr);
      break;

   case nir_intrinsic_brcst_active_ir3:
      dst[0] = emit_intrinsic_brcst_active(ctx, intr);
      break;

   case nir_intrinsic_preamble_end_ir3: {
      struct ir3_instruction *instr = ir3_SHPE(ctx->block);
      instr->barrier_class = instr->barrier_conflict = IR3_BARRIER_CONST_W;
@@ -695,6 +695,7 @@ block_sched(struct ir3 *ir)
      struct ir3_instruction *br1, *br2;

      if (block->brtype == IR3_BRANCH_GETONE ||
          block->brtype == IR3_BRANCH_GETLAST ||
          block->brtype == IR3_BRANCH_SHPS) {
         /* getone/shps can't be inverted, and it wouldn't even make sense
          * to follow it with an inverted branch, so follow it by an
@@ -703,6 +704,8 @@ block_sched(struct ir3 *ir)
         assert(!block->condition);
         if (block->brtype == IR3_BRANCH_GETONE)
            br1 = ir3_GETONE(block);
         else if (block->brtype == IR3_BRANCH_GETLAST)
            br1 = ir3_GETLAST(block);
         else
            br1 = ir3_SHPS(block);
         br1->cat0.target = block->successors[1];
@@ -740,6 +743,7 @@ block_sched(struct ir3 *ir)
         br2->cat0.brtype = BRANCH_ANY;
         break;
      case IR3_BRANCH_GETONE:
      case IR3_BRANCH_GETLAST:
      case IR3_BRANCH_SHPS:
         unreachable("can't get here");
      }
@@ -22,6 +22,7 @@
 */

#include "ir3.h"
#include "ir3_nir.h"
#include "util/ralloc.h"

/* Lower several macro-instructions needed for shader subgroup support that
@@ -241,6 +242,7 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
   case OPC_READ_COND_MACRO:
   case OPC_SWZ_SHARED_MACRO:
   case OPC_SCAN_MACRO:
   case OPC_SCAN_CLUSTERS_MACRO:
      break;
   case OPC_READ_FIRST_MACRO:
      /* Moves to shared registers read the first active fiber, so we can just
@@ -313,6 +315,79 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
      mov_reg(exit, exclusive, reduce);
      do_reduce(exit, instr->cat1.reduce_op, inclusive, src, exclusive);
      mov_reg(exit, reduce, inclusive);
   } else if (instr->opc == OPC_SCAN_CLUSTERS_MACRO) {
      /* The pseudo-code for the scan macro is:
       *
       * while (true) {
       * body:
       *    scratch = reduce;
       *
       *    inclusive = inclusive_src OP scratch;
       *
       *    static if (is exclusive scan)
       *       exclusive = exclusive_src OP scratch
       *
       *    if (getlast()) {
       * store:
       *       reduce = inclusive;
       *       if (elect())
       *          break;
       *    } else {
       *       break;
       *    }
       * }
       * after_block:
       */
      struct ir3_block *body = ir3_block_create(ir);
      list_add(&body->node, &before_block->node);

      struct ir3_block *store = ir3_block_create(ir);
      list_add(&store->node, &body->node);

      link_blocks(before_block, body, 0);

      link_blocks(body, store, 0);
      link_blocks(body, after_block, 1);
      body->brtype = IR3_BRANCH_GETLAST;

      link_blocks(store, after_block, 0);
      link_blocks(store, body, 1);
      store->brtype = IR3_BRANCH_GETONE;

      struct ir3_register *reduce = instr->dsts[0];
      struct ir3_register *inclusive = instr->dsts[1];
      struct ir3_register *inclusive_src = instr->srcs[1];

      /* We need to perform the following operations:
       * - inclusive = inclusive_src OP reduce
       * - exclusive = exclusive_src OP reduce (iff exclusive scan)
       * Since reduce is initially in a shared register, we need to copy it to a
       * scratch register before performing the operations.
       *
       * The scratch register used is:
       * - an explicitly allocated one if op is 32b mul_u.
       *   - necessary because we cannot do 'foo = foo mul_u bar' since mul_u
       *     clobbers its destination.
       * - exclusive if this is an exclusive scan (and not 32b mul_u).
       *   - since we calculate inclusive first.
       * - inclusive otherwise.
       *
       * In all cases, this is the last destination.
       */
      struct ir3_register *scratch = instr->dsts[instr->dsts_count - 1];

      mov_reg(body, scratch, reduce);
      do_reduce(body, instr->cat1.reduce_op, inclusive, inclusive_src, scratch);

      /* exclusive scan */
      if (instr->srcs_count == 3) {
         struct ir3_register *exclusive_src = instr->srcs[2];
         struct ir3_register *exclusive = instr->dsts[2];
         do_reduce(body, instr->cat1.reduce_op, exclusive, exclusive_src,
                   scratch);
      }

      mov_reg(store, reduce, inclusive);
   } else {
      struct ir3_block *then_block = create_if(ir, before_block, after_block);
@@ -447,3 +522,65 @@ ir3_lower_subgroups(struct ir3 *ir)

   return progress;
}

static bool
filter_scan_reduce(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_reduce:
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      return true;
   default:
      return false;
   }
}

static nir_def *
lower_scan_reduce(struct nir_builder *b, nir_instr *instr, void *data)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   unsigned bit_size = intrin->def.bit_size;

   nir_op op = nir_intrinsic_reduction_op(intrin);
   nir_const_value ident_val = nir_alu_binop_identity(op, bit_size);
   nir_def *ident = nir_build_imm(b, 1, bit_size, &ident_val);
   nir_def *inclusive = intrin->src[0].ssa;
   nir_def *exclusive = ident;

   for (unsigned cluster_size = 2; cluster_size <= 8; cluster_size *= 2) {
      nir_def *brcst = nir_brcst_active_ir3(b, ident, inclusive,
                                            .cluster_size = cluster_size);
      inclusive = nir_build_alu2(b, op, inclusive, brcst);

      if (intrin->intrinsic == nir_intrinsic_exclusive_scan)
         exclusive = nir_build_alu2(b, op, exclusive, brcst);
   }

   switch (intrin->intrinsic) {
   case nir_intrinsic_reduce:
      return nir_reduce_clusters_ir3(b, inclusive, .reduction_op = op);
   case nir_intrinsic_inclusive_scan:
      return nir_inclusive_scan_clusters_ir3(b, inclusive, .reduction_op = op);
   case nir_intrinsic_exclusive_scan:
      return nir_exclusive_scan_clusters_ir3(b, inclusive, exclusive,
                                             .reduction_op = op);
   default:
      unreachable("filtered intrinsic");
   }
}

bool
ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v)
{
   if (!v->compiler->has_getfiberid)
      return false;

   return nir_shader_lower_instructions(nir, filter_scan_reduce,
                                        lower_scan_reduce, NULL);
}
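To round out the per-cluster sketch placed after the commit message, the snippet below is an equally rough serial model of the data flow that the getlast/getone loop behind OPC_SCAN_CLUSTERS_MACRO implements: a shared "reduce" value starts at the identity, each cluster folds it into its per-cluster inclusive and exclusive results, and the last fiber of the cluster writes its inclusive result back to "reduce" for the next cluster. It models only the arithmetic, not how the hardware sequences fibers; the 8-wide cluster, 16-lane subgroup, and iadd operation are assumptions carried over from the earlier sketch.

#include <stdio.h>

#define LANES        16
#define CLUSTER_SIZE 8

int
main(void)
{
   /* Per-cluster inclusive/exclusive add-scans, e.g. as produced by the
    * brcst.active sequence in the earlier sketch (lane values are lane + 1). */
   unsigned inclusive[LANES], exclusive[LANES];
   for (unsigned lane = 0; lane < LANES; lane++) {
      unsigned base = lane & ~(CLUSTER_SIZE - 1);
      exclusive[lane] = 0;
      for (unsigned i = base; i < lane; i++)
         exclusive[lane] += i + 1;                    /* per-cluster exclusive scan */
      inclusive[lane] = exclusive[lane] + lane + 1;   /* per-cluster inclusive scan */
   }

   /* Serial model of the getlast loop: the shared "reduce" register starts at
    * the identity; each cluster combines it into its own scan results, then
    * the cluster's last fiber stores its inclusive result back into "reduce"
    * before the next cluster runs. */
   unsigned reduce = 0; /* identity for iadd */
   for (unsigned base = 0; base < LANES; base += CLUSTER_SIZE) {
      unsigned scratch = reduce;                      /* scratch = reduce */
      for (unsigned lane = base; lane < base + CLUSTER_SIZE; lane++) {
         inclusive[lane] += scratch;                  /* inclusive = inclusive_src OP scratch */
         exclusive[lane] += scratch;                  /* exclusive = exclusive_src OP scratch */
      }
      reduce = inclusive[base + CLUSTER_SIZE - 1];    /* last fiber: reduce = inclusive */
   }

   for (unsigned lane = 0; lane < LANES; lane++)
      printf("lane %2u: inclusive %3u exclusive %3u\n",
             lane, inclusive[lane], exclusive[lane]);
   printf("subgroup reduction: %u\n", reduce);
   return 0;
}

After the loop, "reduce" holds the subgroup-wide reduction and the arrays hold the full scans, corresponding to the reduction, inclusive, and exclusive destinations the macro exposes.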
@@ -740,6 +740,8 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
      progress |= OPT(s, nir_opt_constant_folding);
   }

   OPT(s, ir3_nir_opt_subgroups, so);

   /* Do the preamble before analysing UBO ranges, because it's usually
    * higher-value and because it can result in eliminating some indirect UBO
    * accesses where otherwise we'd have to push the whole range. However we
@@ -86,6 +86,8 @@ nir_def *ir3_nir_try_propagate_bit_shift(nir_builder *b,
                                         nir_def *offset,
                                         int32_t shift);

bool ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v);

static inline nir_intrinsic_instr *
ir3_bindless_resource(nir_src src)
{
@@ -137,7 +137,8 @@ print_instr_name(struct log_stream *stream, struct ir3_instruction *instr,
                              disasm_a3xx_instr_name(instr->opc));
   }

   if (instr->opc == OPC_SCAN_MACRO ||
       instr->opc == OPC_SCAN_CLUSTERS_MACRO) {
      switch (instr->cat1.reduce_op) {
      case REDUCE_OP_ADD_U:
         mesa_log_stream_printf(stream, ".add.u");
@@ -548,6 +549,9 @@ print_block(struct ir3_block *block, int lvl)
      case IR3_BRANCH_GETONE:
         mesa_log_stream_printf(stream, "getone ");
         break;
      case IR3_BRANCH_GETLAST:
         mesa_log_stream_printf(stream, "getlast ");
         break;
      case IR3_BRANCH_SHPS:
         mesa_log_stream_printf(stream, "shps ");
         break;
@@ -249,6 +249,27 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
      validate_assert(ctx, reg_class_flags(instr->dsts[1]) ==
                              reg_class_flags(instr->srcs[0]));
      validate_assert(ctx, reg_class_flags(instr->dsts[2]) == IR3_REG_SHARED);
   } else if (instr->opc == OPC_SCAN_CLUSTERS_MACRO) {
      validate_assert(ctx, instr->dsts_count >= 2 && instr->dsts_count < 5);
      validate_assert(ctx, instr->srcs_count >= 2 && instr->srcs_count < 4);
      validate_assert(ctx,
                      reg_class_flags(instr->dsts[0]) == IR3_REG_SHARED);
      validate_assert(ctx, reg_class_flags(instr->dsts[1]) ==
                              reg_class_flags(instr->srcs[1]));

      /* exclusive scan */
      if (instr->srcs_count == 3) {
         validate_assert(ctx, instr->dsts_count >= 3);
         validate_assert(ctx, reg_class_flags(instr->srcs[2]) ==
                                 reg_class_flags(instr->srcs[1]));
         validate_assert(ctx, reg_class_flags(instr->dsts[2]) ==
                                 reg_class_flags(instr->srcs[1]));
      }

      /* scratch register */
      validate_assert(ctx,
                      reg_class_flags(instr->dsts[instr->dsts_count - 1]) ==
                         reg_class_flags(instr->srcs[1]));
   } else {
      foreach_dst (dst, instr)
         validate_reg_size(ctx, dst, instr->cat1.dst_type);