From 6baad09711f40d022975ec6b4c637bb32e6e5194 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 9 Mar 2021 16:09:15 +0000 Subject: [PATCH] aco: use saddr for global access with sgpr address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fossil-db (Sienna Cichlid): Totals from 38 (0.03% of 134621) affected shaders: CodeSize: 237196 -> 237060 (-0.06%); split: -0.09%, +0.03% Instrs: 43895 -> 43894 (-0.00%); split: -0.02%, +0.01% Latency: 914633 -> 916263 (+0.18%); split: -0.01%, +0.19% InvThroughput: 468215 -> 468971 (+0.16%); split: -0.02%, +0.18% SClause: 1239 -> 1242 (+0.24%) PreSGPRs: 997 -> 1003 (+0.60%) PreVGPRs: 936 -> 923 (-1.39%); split: -1.50%, +0.11% Regression seems to be RA noise, creating a waitcnt. Signed-off-by: Rhys Perry Reviewed-by: Timur Kristóf Part-of: --- .../compiler/aco_instruction_selection.cpp | 48 ++++++++++++------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 887591c321c..097c1c68567 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -271,16 +271,21 @@ emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask) } Temp -as_vgpr(isel_context* ctx, Temp val) +as_vgpr(Builder& bld, Temp val) { - if (val.type() == RegType::sgpr) { - Builder bld(ctx->program, ctx->block); + if (val.type() == RegType::sgpr) return bld.copy(bld.def(RegType::vgpr, val.size()), val); - } assert(val.type() == RegType::vgpr); return val; } +Temp +as_vgpr(isel_context* ctx, Temp val) +{ + Builder bld(ctx->program, ctx->block); + return as_vgpr(bld, val); +} + // assumes a != 0xffffffff void emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b) @@ -4306,12 +4311,15 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsign mubuf->definitions[0] = Definition(val); 
bld.insert(std::move(mubuf)); } else { - offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset; - aco_ptr<FLAT_instruction> flat{ create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)}; - flat->operands[0] = Operand(offset); - flat->operands[1] = Operand(s1); + if (global && offset.regClass() == s2) { + flat->operands[0] = bld.copy(bld.def(v1), Operand::zero()); + flat->operands[1] = Operand(offset); + } else { + flat->operands[0] = Operand(as_vgpr(bld, offset)); + flat->operands[1] = Operand(s1); + } flat->glc = info.glc; flat->dlc = info.glc && bld.program->chip_class >= GFX10; flat->sync = info.sync; @@ -6440,9 +6448,6 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr) bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); - if (ctx->options->chip_class >= GFX7) - addr = as_vgpr(ctx, addr); - unsigned write_count = 0; Temp write_datas[32]; unsigned offsets[32]; @@ -6491,8 +6496,13 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr) aco_ptr<FLAT_instruction> flat{ create_instruction<FLAT_instruction>(op, global ? 
Format::GLOBAL : Format::FLAT, 3, 0)}; - flat->operands[0] = Operand(store_addr); - flat->operands[1] = Operand(s1); + if (global && store_addr.regClass() == s2) { + flat->operands[0] = bld.copy(bld.def(v1), Operand::zero()); + flat->operands[1] = Operand(store_addr); + } else { + flat->operands[0] = Operand(as_vgpr(ctx, store_addr)); + flat->operands[1] = Operand(s1); + } flat->operands[2] = Operand(write_datas[i]); flat->glc = glc; flat->dlc = false; @@ -6534,9 +6544,6 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr) Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); - if (ctx->options->chip_class >= GFX7) - addr = as_vgpr(ctx, addr); - if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap) data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2), get_ssa_temp(ctx, instr->src[2].ssa), data); @@ -6604,8 +6611,13 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr) aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>( op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)}; - flat->operands[0] = Operand(addr); - flat->operands[1] = Operand(s1); + if (global && addr.regClass() == s2) { + flat->operands[0] = bld.copy(bld.def(v1), Operand::zero()); + flat->operands[1] = Operand(addr); + } else { + flat->operands[0] = Operand(as_vgpr(ctx, addr)); + flat->operands[1] = Operand(s1); + } flat->operands[2] = Operand(data); if (return_previous) flat->definitions[0] = Definition(dst);