From 9169fbf83c4dfd31380e3cc1028ca698246d1e0a Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 15 Aug 2023 21:01:49 +0100 Subject: [PATCH] aco: clarify bpermute pseudo opcode names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Reviewed-by: Timur Kristóf Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 6 +++--- src/amd/compiler/aco_lower_to_hw_instr.cpp | 18 +++++++++--------- src/amd/compiler/aco_opcodes.py | 12 ++++++------ src/amd/compiler/aco_optimizer.cpp | 6 +++--- src/amd/compiler/aco_reduce_assign.cpp | 6 +++--- src/amd/compiler/aco_validate.cpp | 2 +- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index ab7b1891cd2..8521c658412 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -208,7 +208,7 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data) index_op.setLateKill(true); input_data.setLateKill(true); - return bld.pseudo(aco_opcode::p_bpermute_gfx6, bld.def(v1), bld.def(bld.lm), + return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data); } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) { @@ -234,10 +234,10 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data) */ ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule; - return bld.pseudo(aco_opcode::p_bpermute_gfx10w64, bld.def(v1), bld.def(s2), + return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half); } else { - return bld.pseudo(aco_opcode::p_bpermute_gfx11w64, bld.def(v1), bld.def(s2), + return bld.pseudo(aco_opcode::p_bpermute_permlane, bld.def(v1), bld.def(s2), bld.def(s1, scc), Operand(v1.as_linear()), index_x4, input_data, same_half); } diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index f34ca358b6d..370ab27f091 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -932,7 +932,7 @@ emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned c } void -emit_gfx11_wave64_bpermute(Program* program, aco_ptr& instr, Builder& bld) +emit_bpermute_permlane(Program* program, aco_ptr& instr, Builder& bld) { /* Emulates proper bpermute on GFX11 in wave64 mode. * @@ -993,7 +993,7 @@ emit_gfx11_wave64_bpermute(Program* program, aco_ptr& instr, Builde } void -emit_gfx10_wave64_bpermute(Program* program, aco_ptr& instr, Builder& bld) +emit_bpermute_shared_vgpr(Program* program, aco_ptr& instr, Builder& bld) { /* Emulates proper bpermute on GFX10 in wave64 mode. * @@ -1072,7 +1072,7 @@ emit_gfx10_wave64_bpermute(Program* program, aco_ptr& instr, Builde } void -emit_gfx6_bpermute(Program* program, aco_ptr& instr, Builder& bld) +emit_bpermute_readlane(Program* program, aco_ptr& instr, Builder& bld) { /* Emulates bpermute using readlane instructions */ @@ -2555,16 +2555,16 @@ lower_to_hw_instr(Program* program) Operand(pops_exiting_wave_id, s1), instr->operands[0]); break; } - case aco_opcode::p_bpermute_gfx6: { - emit_gfx6_bpermute(program, instr, bld); + case aco_opcode::p_bpermute_readlane: { + emit_bpermute_readlane(program, instr, bld); break; } - case aco_opcode::p_bpermute_gfx10w64: { - emit_gfx10_wave64_bpermute(program, instr, bld); + case aco_opcode::p_bpermute_shared_vgpr: { + emit_bpermute_shared_vgpr(program, instr, bld); break; } - case aco_opcode::p_bpermute_gfx11w64: { - emit_gfx11_wave64_bpermute(program, instr, bld); + case aco_opcode::p_bpermute_permlane: { + emit_bpermute_permlane(program, instr, bld); break; } case aco_opcode::p_constaddr: { diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index c6766934aa4..137a9224e1f 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -334,20 +334,20 @@ opcode("p_demote_to_helper") opcode("p_is_helper") opcode("p_exit_early_if") -# simulates proper bpermute behavior on GFX6 +# simulates proper bpermute behavior using v_readlane_b32 # definitions: result VGPR, temp EXEC, clobbered VCC # operands: index, input data -opcode("p_bpermute_gfx6") +opcode("p_bpermute_readlane") -# simulates proper bpermute behavior on GFX10 +# simulates proper wave64 bpermute behavior using shared vgprs (for GFX10/10.3) # definitions: result VGPR, temp EXEC, clobbered SCC # operands: index * 4, input data, same half (bool) -opcode("p_bpermute_gfx10w64") +opcode("p_bpermute_shared_vgpr") -# simulates proper bpermute behavior on GFX11 +# simulates proper wave64 bpermute behavior using v_permlane64_b32 (for GFX11+) # definitions: result VGPR, temp EXEC, clobbered SCC # operands: linear VGPR, index * 4, input data, same half (bool) -opcode("p_bpermute_gfx11w64") +opcode("p_bpermute_permlane") # creates a lane mask where only the first active lane is selected opcode("p_elect") diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 16e0450bf08..4c03cacd761 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -670,9 +670,9 @@ alu_can_accept_constant(const aco_ptr& instr, unsigned operand) case aco_opcode::v_readfirstlane_b32: case aco_opcode::p_extract: case aco_opcode::p_insert: return operand != 0; - case aco_opcode::p_bpermute_gfx6: - case aco_opcode::p_bpermute_gfx10w64: - case aco_opcode::p_bpermute_gfx11w64: + case aco_opcode::p_bpermute_readlane: + case aco_opcode::p_bpermute_shared_vgpr: + case aco_opcode::p_bpermute_permlane: case aco_opcode::p_interp_gfx11: case aco_opcode::p_dual_src_export_gfx11: case aco_opcode::v_interp_p1_f32: diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp index 142ce776e01..83514206d46 100644 --- a/src/amd/compiler/aco_reduce_assign.cpp +++ b/src/amd/compiler/aco_reduce_assign.cpp @@ -46,7 +46,7 @@ setup_reduce_temp(Program* program) for (Block& block : program->blocks) { for (aco_ptr& instr : block.instructions) { if (instr->opcode == aco_opcode::p_interp_gfx11 || - instr->opcode == aco_opcode::p_bpermute_gfx11w64) { + instr->opcode == aco_opcode::p_bpermute_permlane) { maxSize = MAX2(maxSize, 1); hasReductions[block.index] = true; } else if (instr->format == Format::PSEUDO_REDUCTION) { @@ -101,7 +101,7 @@ setup_reduce_temp(Program* program) Instruction* instr = (*it).get(); if (instr->format != Format::PSEUDO_REDUCTION && instr->opcode != aco_opcode::p_interp_gfx11 && - instr->opcode != aco_opcode::p_bpermute_gfx11w64) + instr->opcode != aco_opcode::p_bpermute_permlane) continue; if ((int)last_top_level_block_idx != inserted_at) { @@ -173,7 +173,7 @@ setup_reduce_temp(Program* program) instr->operands[2] = Operand(vtmp); } else { assert(instr->opcode == aco_opcode::p_interp_gfx11 || - instr->opcode == aco_opcode::p_bpermute_gfx11w64); + instr->opcode == aco_opcode::p_bpermute_permlane); instr->operands[0] = Operand(reduceTmp); instr->operands[0].setLateKill(true); } diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 0aa6767654f..32c3d798dea 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -268,7 +268,7 @@ validate_ir(Program* program) instr->opcode == aco_opcode::p_dual_src_export_gfx11 || instr->opcode == aco_opcode::p_end_with_regs || (instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) || - (instr->opcode == aco_opcode::p_bpermute_gfx11w64 && i == 0) || + (instr->opcode == aco_opcode::p_bpermute_permlane && i == 0) || (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) || ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) || (instr->isScratch() && i == 0) || (instr->isDS() && i == 0) ||