From d20a752c0de426e4c7c64a4d42d10f373f73c97a Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Thu, 15 Oct 2020 15:09:20 +0100
Subject: [PATCH] aco: use Builder::copy more
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fossil-db (Navi):
Totals from 6973 (5.07% of 137413) affected shaders:
SGPRs: 381768 -> 381776 (+0.00%)
VGPRs: 306092 -> 306096 (+0.00%); split: -0.00%, +0.00%
CodeSize: 24440844 -> 24421196 (-0.08%); split: -0.09%, +0.01%
MaxWaves: 86581 -> 86583 (+0.00%)
Instrs: 4682161 -> 4679578 (-0.06%); split: -0.06%, +0.00%
Cycles: 68793116 -> 68261648 (-0.77%); split: -0.83%, +0.05%

fossil-db (Polaris):
Totals from 8154 (5.87% of 138881) affected shaders:
VGPRs: 338916 -> 338920 (+0.00%); split: -0.00%, +0.00%
CodeSize: 23540428 -> 23540488 (+0.00%); split: -0.00%, +0.00%
MaxWaves: 49090 -> 49091 (+0.00%)
Instrs: 4576085 -> 4576101 (+0.00%); split: -0.00%, +0.00%
Cycles: 51720704 -> 51720888 (+0.00%); split: -0.00%, +0.00%

Most of the Navi cycle/instruction changes are from 8/16-bit parallel-rdp
shaders. They appear to be improved because the p_create_vector from
lower_subdword_phis() was blocking constant propagation.
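As background for the conversions below, here is a minimal sketch with
hypothetical names (RegClass, copy_opcode), not ACO's real classes: call
sites now just request a copy, and the concrete mov flavour is picked in
one place from the destination register class when the copy
pseudo-instruction is lowered, instead of each call site open-coding the
choice. Roughly:

#include <cstdio>

/* toy register classes, mirroring the cases the old call sites tested */
enum class RegClass { s1, s2, v1, v1b, v2b, v2 };

static const char *copy_opcode(RegClass dst)
{
   switch (dst) {
   case RegClass::s1: return "s_mov_b32";
   case RegClass::s2: return "s_mov_b64";
   case RegClass::v1: return "v_mov_b32";
   default:           return "p_create_vector"; /* subdword or multi-dword */
   }
}

int main()
{
   /* what roughly twenty call sites used to open-code: */
   std::printf("copy to s2 -> %s\n", copy_opcode(RegClass::s2));
   std::printf("copy to v1 -> %s\n", copy_opcode(RegClass::v1));
   return 0;
}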
Signed-off-by: Rhys Perry
Reviewed-by: Timur Kristóf
Reviewed-by: Daniel Schürmann
Part-of:
---
 src/amd/compiler/aco_insert_exec_mask.cpp  | 12 +--
 .../compiler/aco_instruction_selection.cpp | 92 +++++++------------
 src/amd/compiler/aco_lower_phis.cpp        | 10 +-
 src/amd/compiler/aco_lower_to_hw_instr.cpp | 23 +++--
 4 files changed, 55 insertions(+), 82 deletions(-)

diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 30c408a354e..638157ccf2f 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -381,7 +381,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
    /* exec seems to need to be manually initialized with combined shaders */
    if (ctx.program->stage.num_sw_stages() > 1 || ctx.program->stage.hw == HWStage::NGG) {
-      bld.sop1(Builder::s_mov, bld.exec(Definition(exec_mask)), bld.lm == s2 ? Operand(UINT64_MAX) : Operand(UINT32_MAX));
+      bld.copy(bld.exec(Definition(exec_mask)), Operand(UINT32_MAX, bld.lm == s2));
       instructions[0]->definitions.pop_back();
    }
@@ -653,15 +653,15 @@ void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instruction>& instr, Temp cur_exec)
    Operand offset = instr->operands[1];
    if (need_check) {
       /* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
-      Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u));
+      Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u, bld.lm == s2));

       if (offset.isLiteral())
-         offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset);
+         offset = bld.copy(bld.def(s1), offset);

       offset = bld.sop2(aco_opcode::s_cselect_b32, bld.hint_m0(bld.def(s1)),
                         offset, Operand(UINT32_MAX), bld.scc(nonempty));
    } else if (offset.isConstant() && offset.constantValue() > 0xFFFFF) {
-      offset = bld.sop1(aco_opcode::s_mov_b32, bld.hint_m0(bld.def(s1)), offset);
+      offset = bld.copy(bld.hint_m0(bld.def(s1)), offset);
    }

    if (!offset.isConstant())
       offset.setFixed(m0);
@@ -1076,7 +1076,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
       unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
       Block& succ = ctx.program->blocks[succ_idx];
       if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
-         ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
+         ctx.info[idx].exec.back().first = bld.copy(bld.def(bld.lm, exec), Operand(0u, bld.lm == s2));
       }
       bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond),
                  block->linear_succs[1], block->linear_succs[0]);
@@ -1105,7 +1105,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
       unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
       Block& succ = ctx.program->blocks[succ_idx];
       if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
-         ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
+         ctx.info[idx].exec.back().first = bld.copy(bld.def(bld.lm, exec), Operand(0u, bld.lm == s2));
       }
       bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond),
                  block->linear_succs[1], block->linear_succs[0]);
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 4de0fe69e8a..6b82f8289e9 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -279,7 +279,7 @@ void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
    bool post_shift = info.post_shift != 0;

    if (!pre_shift && !increment && !multiply && !post_shift) {
-      bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
+      bld.copy(Definition(dst), a);
       return;
    }
@@ -299,7 +299,7 @@ void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
    if (multiply) {
       multiply_dst = post_shift ? bld.tmp(v1) : dst;
       bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
-               bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
+               bld.copy(bld.def(v1), Operand((uint32_t)info.multiplier)));
    }

    if (post_shift) {
@@ -1007,7 +1007,7 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
       then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);

    if (cond.id() == els.id())
-      bld.sop1(Builder::s_mov, Definition(dst), then);
+      bld.copy(Definition(dst), then);
    else
       bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
                bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
@@ -1200,26 +1200,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
    }
    case nir_op_mov: {
       Temp src = get_alu_src(ctx, instr->src[0]);
-      aco_ptr<Instruction> mov;
-      if (dst.type() == RegType::sgpr) {
-         if (src.type() == RegType::vgpr)
-            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
-         else if (src.regClass() == s1)
-            bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
-         else if (src.regClass() == s2)
-            bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
-         else
-            unreachable("wrong src register class for nir_op_imov");
-      } else {
-         if (dst.regClass() == v1)
-            bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
-         else if (dst.regClass() == v1b ||
-                  dst.regClass() == v2b ||
-                  dst.regClass() == v2)
-            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
-         else
-            unreachable("wrong src register class for nir_op_imov");
-      }
+      if (src.bytes() != dst.bytes())
+         unreachable("wrong src or dst register class for nir_op_mov");
+      if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr)
+         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+      else
+         bld.copy(Definition(dst), src);
       break;
    }
    case nir_op_inot: {
@@ -2113,11 +2099,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
       } else if (dst.regClass() == v2) {
          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
-         Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
+         Temp tmp = bld.copy(bld.def(v1), Operand(0x3FF00000u));
          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);

          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
-         tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
+         tmp = bld.copy(bld.def(v1), Operand(0xBFF00000u));
          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);

          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
@@ -2492,7 +2478,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
          src = bool_to_scalar_condition(ctx, src);
          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
       } else if (dst.regClass() == v2) {
-         Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
+         Temp one = bld.copy(bld.def(v1), Operand(0x3FF00000u));
          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
       } else {
@@ -2902,13 +2888,12 @@ void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
       assert(dst.regClass() == bld.lm);
       int val = instr->value[0].b ? -1 : 0;
       Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
-      bld.sop1(Builder::s_mov, Definition(dst), op);
+      bld.copy(Definition(dst), op);
    } else if (instr->def.bit_size == 8) {
-      /* ensure that the value is correctly represented in the low byte of the register */
-      bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u8);
+      bld.copy(Definition(dst), Operand((uint32_t)instr->value[0].u8));
    } else if (instr->def.bit_size == 16) {
-      /* ensure that the value is correctly represented in the low half of the register */
-      bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u16);
+      /* sign-extend to use s_movk_i32 instead of a literal */
+      bld.copy(Definition(dst), Operand((uint32_t)instr->value[0].i16));
    } else if (dst.size() == 1) {
       bld.copy(Definition(dst), Operand(instr->value[0].u32));
    } else {
@@ -3209,7 +3194,7 @@ void emit_load(isel_context *ctx, Builder &bld, const LoadEmitInfo &info,

 Operand load_lds_size_m0(Builder& bld)
 {
    /* TODO: m0 does not need to be initialized on GFX9+ */
-   return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
+   return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand(0xffffffffu)));
 }

 Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info,
@@ -4601,7 +4586,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
                index = bld.vadd32(bld.def(v1), start_instance, instance_id);
             }
          } else {
-            index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance);
+            index = bld.copy(bld.def(v1), start_instance);
          }
       } else {
          index = bld.vadd32(bld.def(v1),
@@ -5582,7 +5567,7 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec
    bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), Operand(0u),
                 emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);

-   Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
+   Temp sample_index_v = bld.copy(bld.def(v1), sample_index);
    /* Replace the MSAA sample index. */
    return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
@@ -5985,7 +5970,7 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)

    /* LOD */
    assert(nir_src_as_uint(instr->src[1]) == 0);
-   Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
+   Temp lod = bld.copy(bld.def(v1), Operand(0u));

    /* Resource */
    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
                                     ACO_DESC_IMAGE, NULL, true, false);
@@ -6802,7 +6787,7 @@ void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
       Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
                                 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
-      Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
+      Temp ps_iter_mask = bld.copy(bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
       Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
       bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
@@ -7167,15 +7152,10 @@ void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp s
    Builder bld(ctx->program, ctx->block);
    Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
    assert(dst.regClass().type() != RegType::vgpr);
-   if (src.regClass().type() == RegType::vgpr) {
+   if (src.regClass().type() == RegType::vgpr)
       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
-   } else if (src.regClass() == s1) {
-      bld.sop1(aco_opcode::s_mov_b32, dst, src);
-   } else if (src.regClass() == s2) {
-      bld.sop1(aco_opcode::s_mov_b64, dst, src);
-   } else {
-      isel_err(&instr->instr, "Unimplemented NIR instr bit size");
-   }
+   else
+      bld.copy(dst, src);
 }

 void emit_addition_uniform_reduce(isel_context *ctx, nir_op op, Definition dst, nir_src src, Temp count)
@@ -7768,7 +7748,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
         bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc),
                  get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
      } else {
-        bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
+        bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
      }
      break;
   }
@@ -7781,7 +7761,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
         bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc),
                  Operand(0x3fu), get_arg(ctx, ctx->args->ac.tg_size));
      else
-        bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
+        bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
      break;
   }
   case nir_intrinsic_ballot: {
@@ -7882,12 +7862,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
         Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
                             bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
         bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
-     } else if (src.regClass() == s1) {
-        bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
-     } else if (src.regClass() == s2) {
-        bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
      } else {
-        isel_err(&instr->instr, "Unimplemented NIR instr bit size");
+        bld.copy(Definition(dst), src);
      }
      break;
   }
@@ -8705,7 +8681,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
            pack =
 bld.sop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
      }
      if (pack_const && pack == Temp())
-        offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
+        offset = bld.copy(bld.def(v1), Operand(pack_const));
      else if (pack == Temp())
         has_offset = false;
      else
@@ -8809,7 +8785,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
   aco_ptr<MIMG_instruction> tex;
   if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
      if (!has_lod)
-        lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
+        lod = bld.copy(bld.def(v1), Operand(0u));

      bool div_by_6 = instr->op == nir_texop_txs &&
                      instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
@@ -8861,7 +8837,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
      tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
      tex->operands[0] = Operand(resource);
      tex->operands[1] = Operand(s4); /* no sampler */
-     tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
+     tex->operands[2] = bld.copy(bld.def(v1), Operand(0u));
      tex->dim = dim;
      tex->dmask = 0x3;
      tex->da = da;
@@ -9259,10 +9235,8 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr)

   if (num_defined == 0) {
      Builder bld(ctx->program, ctx->block);
-     if (dst.regClass() == s1) {
-        bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
-     } else if (dst.regClass() == v1) {
-        bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
+     if (dst.bytes() == 4) {
+        bld.copy(Definition(dst), Operand(0u));
      } else {
         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
         for (unsigned i = 0; i < dst.size(); i++)
@@ -11385,7 +11359,7 @@ void ngg_gs_write_shader_query(isel_context *ctx, nir_intrinsic_instr *instr)
   bld.reset(ctx->block);

   Temp gds_addr = bld.copy(bld.def(v1), Operand(0u));
-  Operand m = bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0x100));
+  Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand(0x100u)));
   bld.ds(aco_opcode::ds_add_u32, gds_addr, as_vgpr(ctx, sg_prm_cnt), m, 0u, 0u, true);

   begin_divergent_if_else(ctx, &ic_last_lane);
diff --git a/src/amd/compiler/aco_lower_phis.cpp b/src/amd/compiler/aco_lower_phis.cpp
index f21d954e2d6..42e2e8997bf 100644
--- a/src/amd/compiler/aco_lower_phis.cpp
+++ b/src/amd/compiler/aco_lower_phis.cpp
@@ -128,7 +128,7 @@ void build_merge_code(Program *program, Block *block, Definition dst, Operand pr
    bld.reset(&block->instructions, std::prev(it.base()));
    if (prev.isUndefined()) {
-      bld.sop1(Builder::s_mov, dst, cur);
+      bld.copy(dst, cur);
       return;
    }
@@ -150,16 +150,16 @@ void build_merge_code(Program *program, Block *block, Definition dst, Operand pr
       if (!cur_is_constant)
          bld.sop2(Builder::s_orn2, dst, bld.def(s1, scc), cur, Operand(exec, bld.lm));
       else if (cur.constantValue64(true))
-         bld.sop1(Builder::s_mov, dst, program->wave_size == 64 ? Operand(UINT64_MAX) : Operand(UINT32_MAX));
+         bld.copy(dst, Operand(UINT32_MAX, bld.lm == s2));
       else
          bld.sop1(Builder::s_not, dst, bld.def(s1, scc), Operand(exec, bld.lm));
    } else {
       if (!cur_is_constant)
         bld.sop2(Builder::s_and, dst, bld.def(s1, scc), cur, Operand(exec, bld.lm));
      else if (cur.constantValue64(true))
-        bld.sop1(Builder::s_mov, dst, Operand(exec, bld.lm));
+        bld.copy(dst, Operand(exec, bld.lm));
      else
-        bld.sop1(Builder::s_mov, dst,
 program->wave_size == 64 ? Operand((uint64_t)0u) : Operand(0u));
+        bld.copy(dst, Operand(0u, bld.lm == s2));
   }
 }
@@ -266,7 +266,7 @@ void lower_subdword_phis(Program *program, Block *block, aco_ptr<Instruction>& p
      assert(phi_src.regClass().type() == RegType::sgpr);

      Temp tmp = bld.tmp(RegClass(RegType::vgpr, phi_src.size()));
-     insert_before_logical_end(pred, bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), phi_src).get_ptr());
+     insert_before_logical_end(pred, bld.copy(Definition(tmp), phi_src).get_ptr());
      Temp new_phi_src = bld.tmp(phi->definitions[0].regClass());
      insert_before_logical_end(pred, bld.pseudo(aco_opcode::p_extract_vector, Definition(new_phi_src),
                                                 tmp, Operand(0u)).get_ptr());
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 3c4898e0d59..8c6766b17c8 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -1003,9 +1003,10 @@ void copy_constant(lower_context *ctx, Builder& bld, Definition dst, Operand op)
       }
    }

+   if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->chip_class >= GFX8)
+      op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
+
    if (dst.regClass() == s1) {
-      if (op.constantEquals(0x3e22f983) && ctx->program->chip_class >= GFX8)
-         op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
       bld.sop1(aco_opcode::s_mov_b32, dst, op);
    } else if (dst.regClass() == s2) {
       bld.sop1(aco_opcode::s_mov_b64, dst, op);
@@ -1066,8 +1067,14 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool
    if (def.physReg() == scc) {
       bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand(0u));
       *preserve_scc = true;
-   } else if (def.bytes() == 8 && def.getTemp().type() == RegType::sgpr) {
-      bld.sop1(aco_opcode::s_mov_b64, def, Operand(op.physReg(), s2));
+   } else if (op.isConstant()) {
+      copy_constant(ctx, bld, def, op);
+   } else if (def.regClass() == v1) {
+      bld.vop1(aco_opcode::v_mov_b32, def, op);
+   } else if (def.regClass() == s1) {
+      bld.sop1(aco_opcode::s_mov_b32, def, op);
+   } else if (def.regClass() == s2) {
+      bld.sop1(aco_opcode::s_mov_b64, def, op);
    } else if (def.regClass().is_subdword() && ctx->program->chip_class < GFX8) {
       if (op.physReg().byte()) {
          assert(def.physReg().byte() == 0);
@@ -1098,14 +1105,6 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool
       } else {
          bld.vop1(aco_opcode::v_mov_b32, def, op);
       }
-   } else if (op.isConstant()) {
-      copy_constant(ctx, bld, def, op);
-   } else if (def.regClass() == v1) {
-      bld.vop1(aco_opcode::v_mov_b32, def, op);
-   } else if (def.regClass() == s1) {
-      bld.sop1(aco_opcode::s_mov_b32, def, op);
-   } else if (def.regClass() == s2) {
-      bld.sop1(aco_opcode::s_mov_b64, def, op);
    } else if (def.regClass().is_subdword()) {
       bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
    } else {
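
Note on the lower_subdword_phis() hunk above, since the commit message
credits it for the parallel-rdp improvements. Builder::copy emits a
p_parallelcopy pseudo-instruction, so the per-predecessor copy becomes,
schematically (made-up IR, not real ACO dumps):

   before:  v1: %tmp = p_create_vector %phi_src
   after:   v1: %tmp = p_parallelcopy %phi_src

The optimizer treats a parallelcopy as a plain copy and can propagate a
constant %phi_src through it; per the commit message, the single-operand
p_create_vector was not looked through, so the constant stayed hidden.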
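A second note for reviewers: the recurring Operand(0u, bld.lm == s2) and
Operand(UINT32_MAX, bld.lm == s2) arguments rely on Operand's two-argument
constructor sizing the constant to the lane-mask width, which is what lets
one call replace the old wave_size ternaries. A self-contained toy of the
intended semantics (ConstOp is a hypothetical stand-in, not ACO's Operand):

#include <cassert>
#include <cstdint>

struct ConstOp {
   uint64_t value;   /* constant value at the selected width */
   unsigned bytes;   /* 4 for wave32 lane masks, 8 for wave64 */
   ConstOp(uint32_t v, bool is64bit = false)
      /* sign-extend so UINT32_MAX widens to a 64-bit all-ones mask */
      : value(is64bit ? (uint64_t)(int64_t)(int32_t)v : v),
        bytes(is64bit ? 8u : 4u) {}
};

int main()
{
   assert(ConstOp(UINT32_MAX, true).value == UINT64_MAX);  /* wave64 full mask */
   assert(ConstOp(UINT32_MAX, false).value == UINT32_MAX); /* wave32 full mask */
   assert(ConstOp(0u, true).value == 0 && ConstOp(0u, true).bytes == 8);
   return 0;
}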