aco: optimize more uniform reductions/scans
Uniform atomic optimization will create these. Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6558>
This commit is contained in:
@@ -349,6 +349,10 @@ public:
|
|||||||
return def;
|
return def;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Definition hint_${fixed}(RegClass rc) {
|
||||||
|
return hint_${fixed}(def(rc));
|
||||||
|
}
|
||||||
|
|
||||||
% endfor
|
% endfor
|
||||||
/* hand-written helpers */
|
/* hand-written helpers */
|
||||||
Temp as_uniform(Op op)
|
Temp as_uniform(Op op)
|
||||||
|
@@ -122,23 +122,27 @@ Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
|
|||||||
return ctx->allocated[def->index];
|
return ctx->allocated[def->index];
|
||||||
}
|
}
|
||||||
|
|
||||||
Temp emit_mbcnt(isel_context *ctx, Temp dst, Temp mask = Temp(), Operand base = Operand(0u))
|
Temp emit_mbcnt(isel_context *ctx, Temp dst, Operand mask = Operand(), Operand base = Operand(0u))
|
||||||
{
|
{
|
||||||
Builder bld(ctx->program, ctx->block);
|
Builder bld(ctx->program, ctx->block);
|
||||||
assert(mask.id() == 0 || mask.regClass() == bld.lm);
|
assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
|
||||||
|
assert(mask.isUndefined() || mask.regClass() == bld.lm);
|
||||||
|
|
||||||
if (ctx->program->wave_size == 32) {
|
if (ctx->program->wave_size == 32) {
|
||||||
Operand mask_lo = mask.id() ? Operand(mask) : Operand(-1u);
|
Operand mask_lo = mask.isUndefined() ? Operand(-1u) : mask;
|
||||||
return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
|
return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
|
||||||
}
|
}
|
||||||
|
|
||||||
Operand mask_lo(-1u);
|
Operand mask_lo(-1u);
|
||||||
Operand mask_hi(-1u);
|
Operand mask_hi(-1u);
|
||||||
|
|
||||||
if (mask.id()) {
|
if (mask.isTemp()) {
|
||||||
Builder::Result mask_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), mask);
|
Builder::Result mask_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), mask);
|
||||||
mask_lo = Operand(mask_split.def(0).getTemp());
|
mask_lo = Operand(mask_split.def(0).getTemp());
|
||||||
mask_hi = Operand(mask_split.def(1).getTemp());
|
mask_hi = Operand(mask_split.def(1).getTemp());
|
||||||
|
} else if (mask.physReg() == exec) {
|
||||||
|
mask_lo = Operand(exec_lo, s1);
|
||||||
|
mask_hi = Operand(exec_hi, s1);
|
||||||
}
|
}
|
||||||
|
|
||||||
Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);
|
Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);
|
||||||
@@ -7112,7 +7116,7 @@ Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
|
|||||||
else
|
else
|
||||||
tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
|
tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
|
||||||
|
|
||||||
Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), tmp);
|
Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
|
||||||
|
|
||||||
Definition cmp_def = Definition();
|
Definition cmp_def = Definition();
|
||||||
if (op == nir_op_iand)
|
if (op == nir_op_iand)
|
||||||
@@ -7145,10 +7149,36 @@ Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
|
|||||||
return Temp();
|
return Temp();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ReduceOp get_reduce_op(nir_op op, unsigned bit_size)
|
||||||
|
{
|
||||||
|
switch (op) {
|
||||||
|
#define CASEI(name) case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64;
|
||||||
|
#define CASEF(name) case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
|
||||||
|
CASEI(iadd)
|
||||||
|
CASEI(imul)
|
||||||
|
CASEI(imin)
|
||||||
|
CASEI(umin)
|
||||||
|
CASEI(imax)
|
||||||
|
CASEI(umax)
|
||||||
|
CASEI(iand)
|
||||||
|
CASEI(ior)
|
||||||
|
CASEI(ixor)
|
||||||
|
CASEF(fadd)
|
||||||
|
CASEF(fmul)
|
||||||
|
CASEF(fmin)
|
||||||
|
CASEF(fmax)
|
||||||
|
default:
|
||||||
|
unreachable("unknown reduction op");
|
||||||
|
#undef CASEI
|
||||||
|
#undef CASEF
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
|
void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
|
||||||
{
|
{
|
||||||
Builder bld(ctx->program, ctx->block);
|
Builder bld(ctx->program, ctx->block);
|
||||||
Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
|
Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
|
||||||
|
assert(dst.regClass().type() != RegType::vgpr);
|
||||||
if (src.regClass().type() == RegType::vgpr) {
|
if (src.regClass().type() == RegType::vgpr) {
|
||||||
bld.pseudo(aco_opcode::p_as_uniform, dst, src);
|
bld.pseudo(aco_opcode::p_as_uniform, dst, src);
|
||||||
} else if (src.regClass() == s1) {
|
} else if (src.regClass() == s1) {
|
||||||
@@ -7160,6 +7190,145 @@ void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp s
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void emit_addition_uniform_reduce(isel_context *ctx, nir_op op, Definition dst, nir_src src, Temp count)
|
||||||
|
{
|
||||||
|
Builder bld(ctx->program, ctx->block);
|
||||||
|
Temp src_tmp = get_ssa_temp(ctx, src.ssa);
|
||||||
|
|
||||||
|
if (op == nir_op_fadd) {
|
||||||
|
src_tmp = as_vgpr(ctx, src_tmp);
|
||||||
|
Temp tmp = dst.regClass() == s1 ? bld.tmp(src_tmp.regClass()) : dst.getTemp();
|
||||||
|
|
||||||
|
if (src.ssa->bit_size == 16) {
|
||||||
|
count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
|
||||||
|
bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
|
||||||
|
} else {
|
||||||
|
assert(src.ssa->bit_size == 32);
|
||||||
|
count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
|
||||||
|
bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tmp != dst.getTemp())
|
||||||
|
bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dst.regClass() == s1)
|
||||||
|
src_tmp = bld.as_uniform(src_tmp);
|
||||||
|
|
||||||
|
if (op == nir_op_ixor && count.type() == RegType::sgpr)
|
||||||
|
count = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
|
||||||
|
count, Operand(1u));
|
||||||
|
else if (op == nir_op_ixor)
|
||||||
|
count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), count);
|
||||||
|
|
||||||
|
assert(dst.getTemp().type() == count.type());
|
||||||
|
|
||||||
|
if (nir_src_is_const(src)) {
|
||||||
|
if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
|
||||||
|
bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand(0u));
|
||||||
|
else if (nir_src_as_uint(src) == 1)
|
||||||
|
bld.copy(dst, count);
|
||||||
|
else if (nir_src_as_uint(src) == 0 && dst.bytes() <= 2)
|
||||||
|
bld.vop1(aco_opcode::v_mov_b32, dst, Operand(0u)); /* RA will use SDWA if possible */
|
||||||
|
else if (nir_src_as_uint(src) == 0)
|
||||||
|
bld.copy(dst, Operand(0u));
|
||||||
|
else if (count.type() == RegType::vgpr)
|
||||||
|
bld.v_mul_imm(dst, count, nir_src_as_uint(src));
|
||||||
|
else
|
||||||
|
bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
|
||||||
|
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
|
||||||
|
bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
|
||||||
|
} else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
|
||||||
|
bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
|
||||||
|
} else if (dst.getTemp().type() == RegType::vgpr) {
|
||||||
|
bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
|
||||||
|
} else {
|
||||||
|
bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool emit_uniform_reduce(isel_context *ctx, nir_intrinsic_instr *instr)
|
||||||
|
{
|
||||||
|
nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
|
||||||
|
if (op == nir_op_imul || op == nir_op_fmul)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
|
||||||
|
Builder bld(ctx->program, ctx->block);
|
||||||
|
Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
|
||||||
|
unsigned bit_size = instr->src[0].ssa->bit_size;
|
||||||
|
if (bit_size > 32)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Temp thread_count = bld.sop1(
|
||||||
|
Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
|
||||||
|
|
||||||
|
emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
|
||||||
|
} else {
|
||||||
|
emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool emit_uniform_scan(isel_context *ctx, nir_intrinsic_instr *instr)
|
||||||
|
{
|
||||||
|
Builder bld(ctx->program, ctx->block);
|
||||||
|
Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
|
||||||
|
nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
|
||||||
|
bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
|
||||||
|
|
||||||
|
if (op == nir_op_imul || op == nir_op_fmul)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
|
||||||
|
if (instr->src[0].ssa->bit_size > 32)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
Temp packed_tid;
|
||||||
|
if (inc)
|
||||||
|
packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand(1u));
|
||||||
|
else
|
||||||
|
packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
|
||||||
|
|
||||||
|
emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(op == nir_op_imin || op == nir_op_umin ||
|
||||||
|
op == nir_op_imax || op == nir_op_umax ||
|
||||||
|
op == nir_op_iand || op == nir_op_ior ||
|
||||||
|
op == nir_op_fmin || op == nir_op_fmax);
|
||||||
|
|
||||||
|
if (inc) {
|
||||||
|
emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Copy the source and write the reduction operation identity to the first
|
||||||
|
* lane. */
|
||||||
|
Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
|
||||||
|
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
|
||||||
|
ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
|
||||||
|
if (dst.bytes() == 8) {
|
||||||
|
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
|
||||||
|
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
|
||||||
|
uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
|
||||||
|
uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
|
||||||
|
|
||||||
|
lo = bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand(identity_lo)), lane, lo);
|
||||||
|
hi = bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand(identity_hi)), lane, hi);
|
||||||
|
bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
|
||||||
|
} else {
|
||||||
|
uint32_t identity = get_reduction_identity(reduce_op, 0);
|
||||||
|
bld.writelane(dst, bld.copy(bld.hint_m0(s1), Operand(identity)), lane, as_vgpr(ctx, src));
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
Pseudo_reduction_instruction *create_reduction_instr(isel_context *ctx, aco_opcode aco_op, ReduceOp op, Definition dst, Temp src)
|
Pseudo_reduction_instruction *create_reduction_instr(isel_context *ctx, aco_opcode aco_op, ReduceOp op, Definition dst, Temp src)
|
||||||
{
|
{
|
||||||
assert(src.bytes() <= 8);
|
assert(src.bytes() <= 8);
|
||||||
@@ -7760,9 +7929,24 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||||||
nir_intrinsic_cluster_size(instr) : 0;
|
nir_intrinsic_cluster_size(instr) : 0;
|
||||||
cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
|
cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
|
||||||
|
|
||||||
if (!nir_src_is_divergent(instr->src[0]) && (op == nir_op_ior || op == nir_op_iand)) {
|
if (!nir_src_is_divergent(instr->src[0]) &&
|
||||||
emit_uniform_subgroup(ctx, instr, src);
|
cluster_size == ctx->program->wave_size && instr->dest.ssa.bit_size != 1) {
|
||||||
} else if (instr->dest.ssa.bit_size == 1) {
|
/* We use divergence analysis to assign the regclass, so check if it's
|
||||||
|
* working as expected */
|
||||||
|
ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
|
||||||
|
if (instr->intrinsic == nir_intrinsic_inclusive_scan)
|
||||||
|
expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;
|
||||||
|
assert(nir_dest_is_divergent(instr->dest) == expected_divergent);
|
||||||
|
|
||||||
|
if (instr->intrinsic == nir_intrinsic_reduce) {
|
||||||
|
if (emit_uniform_reduce(ctx, instr))
|
||||||
|
break;
|
||||||
|
} else if (emit_uniform_scan(ctx, instr)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (instr->dest.ssa.bit_size == 1) {
|
||||||
if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
|
if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
|
||||||
op = nir_op_iand;
|
op = nir_op_iand;
|
||||||
else if (op == nir_op_iadd)
|
else if (op == nir_op_iadd)
|
||||||
@@ -7791,28 +7975,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||||||
|
|
||||||
src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
|
src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
|
||||||
|
|
||||||
ReduceOp reduce_op;
|
ReduceOp reduce_op = get_reduce_op(op, bit_size);
|
||||||
switch (op) {
|
|
||||||
#define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break;
|
|
||||||
#define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break;
|
|
||||||
CASEI(iadd)
|
|
||||||
CASEI(imul)
|
|
||||||
CASEI(imin)
|
|
||||||
CASEI(umin)
|
|
||||||
CASEI(imax)
|
|
||||||
CASEI(umax)
|
|
||||||
CASEI(iand)
|
|
||||||
CASEI(ior)
|
|
||||||
CASEI(ixor)
|
|
||||||
CASEF(fadd)
|
|
||||||
CASEF(fmul)
|
|
||||||
CASEF(fmin)
|
|
||||||
CASEF(fmax)
|
|
||||||
default:
|
|
||||||
unreachable("unknown reduction op");
|
|
||||||
#undef CASEI
|
|
||||||
#undef CASEF
|
|
||||||
}
|
|
||||||
|
|
||||||
aco_opcode aco_op;
|
aco_opcode aco_op;
|
||||||
switch (instr->intrinsic) {
|
switch (instr->intrinsic) {
|
||||||
@@ -8026,7 +8189,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||||||
case nir_intrinsic_mbcnt_amd: {
|
case nir_intrinsic_mbcnt_amd: {
|
||||||
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
|
Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
|
||||||
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
|
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
|
||||||
Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), src);
|
Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src));
|
||||||
emit_wqm(ctx, wqm_tmp, dst);
|
emit_wqm(ctx, wqm_tmp, dst);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -11082,7 +11245,7 @@ std::pair<Temp, Temp> ngg_gs_workgroup_reduce_and_scan(isel_context *ctx, Temp s
|
|||||||
|
|
||||||
/* Subgroup reduction and exclusive scan on the per-lane boolean. */
|
/* Subgroup reduction and exclusive scan on the per-lane boolean. */
|
||||||
Temp sg_reduction = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), src_mask);
|
Temp sg_reduction = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), src_mask);
|
||||||
Temp sg_excl = emit_mbcnt(ctx, bld.tmp(v1), src_mask);
|
Temp sg_excl = emit_mbcnt(ctx, bld.tmp(v1), Operand(src_mask));
|
||||||
|
|
||||||
if (ctx->program->workgroup_size <= ctx->program->wave_size)
|
if (ctx->program->workgroup_size <= ctx->program->wave_size)
|
||||||
return std::make_pair(sg_reduction, sg_excl);
|
return std::make_pair(sg_reduction, sg_excl);
|
||||||
|
@@ -327,4 +327,83 @@ bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
|
||||||
|
{
|
||||||
|
switch (op) {
|
||||||
|
case iadd8:
|
||||||
|
case iadd16:
|
||||||
|
case iadd32:
|
||||||
|
case iadd64:
|
||||||
|
case fadd16:
|
||||||
|
case fadd32:
|
||||||
|
case fadd64:
|
||||||
|
case ior8:
|
||||||
|
case ior16:
|
||||||
|
case ior32:
|
||||||
|
case ior64:
|
||||||
|
case ixor8:
|
||||||
|
case ixor16:
|
||||||
|
case ixor32:
|
||||||
|
case ixor64:
|
||||||
|
case umax8:
|
||||||
|
case umax16:
|
||||||
|
case umax32:
|
||||||
|
case umax64:
|
||||||
|
return 0;
|
||||||
|
case imul8:
|
||||||
|
case imul16:
|
||||||
|
case imul32:
|
||||||
|
case imul64:
|
||||||
|
return idx ? 0 : 1;
|
||||||
|
case fmul16:
|
||||||
|
return 0x3c00u; /* 1.0 */
|
||||||
|
case fmul32:
|
||||||
|
return 0x3f800000u; /* 1.0 */
|
||||||
|
case fmul64:
|
||||||
|
return idx ? 0x3ff00000u : 0u; /* 1.0 */
|
||||||
|
case imin8:
|
||||||
|
return INT8_MAX;
|
||||||
|
case imin16:
|
||||||
|
return INT16_MAX;
|
||||||
|
case imin32:
|
||||||
|
return INT32_MAX;
|
||||||
|
case imin64:
|
||||||
|
return idx ? 0x7fffffffu : 0xffffffffu;
|
||||||
|
case imax8:
|
||||||
|
return INT8_MIN;
|
||||||
|
case imax16:
|
||||||
|
return INT16_MIN;
|
||||||
|
case imax32:
|
||||||
|
return INT32_MIN;
|
||||||
|
case imax64:
|
||||||
|
return idx ? 0x80000000u : 0;
|
||||||
|
case umin8:
|
||||||
|
case umin16:
|
||||||
|
case iand8:
|
||||||
|
case iand16:
|
||||||
|
return 0xffffffffu;
|
||||||
|
case umin32:
|
||||||
|
case umin64:
|
||||||
|
case iand32:
|
||||||
|
case iand64:
|
||||||
|
return 0xffffffffu;
|
||||||
|
case fmin16:
|
||||||
|
return 0x7c00u; /* infinity */
|
||||||
|
case fmin32:
|
||||||
|
return 0x7f800000u; /* infinity */
|
||||||
|
case fmin64:
|
||||||
|
return idx ? 0x7ff00000u : 0u; /* infinity */
|
||||||
|
case fmax16:
|
||||||
|
return 0xfc00u; /* negative infinity */
|
||||||
|
case fmax32:
|
||||||
|
return 0xff800000u; /* negative infinity */
|
||||||
|
case fmax64:
|
||||||
|
return idx ? 0xfff00000u : 0u; /* negative infinity */
|
||||||
|
default:
|
||||||
|
unreachable("Invalid reduction operation");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -1384,6 +1384,8 @@ bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
|
|||||||
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
|
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
|
||||||
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
|
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
|
||||||
|
|
||||||
|
uint32_t get_reduction_identity(ReduceOp op, unsigned idx);
|
||||||
|
|
||||||
enum block_kind {
|
enum block_kind {
|
||||||
/* uniform indicates that leaving this block,
|
/* uniform indicates that leaving this block,
|
||||||
* all actives lanes stay active */
|
* all actives lanes stay active */
|
||||||
|
@@ -474,85 +474,6 @@ void emit_dpp_mov(lower_context *ctx, PhysReg dst, PhysReg src0, unsigned size,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
|
|
||||||
{
|
|
||||||
switch (op) {
|
|
||||||
case iadd8:
|
|
||||||
case iadd16:
|
|
||||||
case iadd32:
|
|
||||||
case iadd64:
|
|
||||||
case fadd16:
|
|
||||||
case fadd32:
|
|
||||||
case fadd64:
|
|
||||||
case ior8:
|
|
||||||
case ior16:
|
|
||||||
case ior32:
|
|
||||||
case ior64:
|
|
||||||
case ixor8:
|
|
||||||
case ixor16:
|
|
||||||
case ixor32:
|
|
||||||
case ixor64:
|
|
||||||
case umax8:
|
|
||||||
case umax16:
|
|
||||||
case umax32:
|
|
||||||
case umax64:
|
|
||||||
return 0;
|
|
||||||
case imul8:
|
|
||||||
case imul16:
|
|
||||||
case imul32:
|
|
||||||
case imul64:
|
|
||||||
return idx ? 0 : 1;
|
|
||||||
case fmul16:
|
|
||||||
return 0x3c00u; /* 1.0 */
|
|
||||||
case fmul32:
|
|
||||||
return 0x3f800000u; /* 1.0 */
|
|
||||||
case fmul64:
|
|
||||||
return idx ? 0x3ff00000u : 0u; /* 1.0 */
|
|
||||||
case imin8:
|
|
||||||
return INT8_MAX;
|
|
||||||
case imin16:
|
|
||||||
return INT16_MAX;
|
|
||||||
case imin32:
|
|
||||||
return INT32_MAX;
|
|
||||||
case imin64:
|
|
||||||
return idx ? 0x7fffffffu : 0xffffffffu;
|
|
||||||
case imax8:
|
|
||||||
return INT8_MIN;
|
|
||||||
case imax16:
|
|
||||||
return INT16_MIN;
|
|
||||||
case imax32:
|
|
||||||
return INT32_MIN;
|
|
||||||
case imax64:
|
|
||||||
return idx ? 0x80000000u : 0;
|
|
||||||
case umin8:
|
|
||||||
case umin16:
|
|
||||||
case iand8:
|
|
||||||
case iand16:
|
|
||||||
return 0xffffffffu;
|
|
||||||
case umin32:
|
|
||||||
case umin64:
|
|
||||||
case iand32:
|
|
||||||
case iand64:
|
|
||||||
return 0xffffffffu;
|
|
||||||
case fmin16:
|
|
||||||
return 0x7c00u; /* infinity */
|
|
||||||
case fmin32:
|
|
||||||
return 0x7f800000u; /* infinity */
|
|
||||||
case fmin64:
|
|
||||||
return idx ? 0x7ff00000u : 0u; /* infinity */
|
|
||||||
case fmax16:
|
|
||||||
return 0xfc00u; /* negative infinity */
|
|
||||||
case fmax32:
|
|
||||||
return 0xff800000u; /* negative infinity */
|
|
||||||
case fmax64:
|
|
||||||
return idx ? 0xfff00000u : 0u; /* negative infinity */
|
|
||||||
default:
|
|
||||||
unreachable("Invalid reduction operation");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
|
void emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
|
||||||
{
|
{
|
||||||
for (unsigned i = 0; i < size; i++) {
|
for (unsigned i = 0; i < size; i++) {
|
||||||
|
Reference in New Issue
Block a user