From 9bb10b58f33668bd6c5b5210b29a9a6d2f3da9bc Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Wed, 24 Apr 2024 08:41:25 +0200 Subject: [PATCH] aco: use v_cvt_pk_u8_f32 for f2u8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foz-DB Navi31: Totals from 42 (0.05% of 79395) affected shaders: Instrs: 3253747 -> 3248867 (-0.15%); split: -0.15%, +0.00% CodeSize: 16690136 -> 16661772 (-0.17%); split: -0.17%, +0.00% VGPRs: 4176 -> 4128 (-1.15%) Latency: 18485157 -> 18479752 (-0.03%); split: -0.03%, +0.00% InvThroughput: 3659404 -> 3658222 (-0.03%); split: -0.03%, +0.00% Copies: 231891 -> 228145 (-1.62%) VALU: 1785800 -> 1782054 (-0.21%) Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 6 +++++- src/amd/compiler/aco_ir.cpp | 3 ++- src/amd/compiler/aco_lower_to_hw_instr.cpp | 8 ++++++++ src/amd/compiler/aco_opcodes.py | 1 + src/amd/compiler/aco_register_allocation.cpp | 5 ++++- src/amd/compiler/aco_validate.cpp | 4 ++++ 6 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index b0ec7c5c098..0bdc0ce236c 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3264,7 +3264,11 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) } } } else if (instr->src[0].src.ssa->bit_size == 32) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); + if (dst.regClass() == v1b && ctx->program->gfx_level >= GFX11) + bld.vop3(aco_opcode::p_v_cvt_pk_u8_f32, Definition(dst), + get_alu_src(ctx, instr->src[0])); + else + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); } else { emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); } diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 35ba6691139..c3c76f3e3cc 100644 --- a/src/amd/compiler/aco_ir.cpp +++ 
b/src/amd/compiler/aco_ir.cpp @@ -433,7 +433,8 @@ can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp instr->opcode != aco_opcode::v_permlanex16_b32 && instr->opcode != aco_opcode::v_permlane64_b32 && instr->opcode != aco_opcode::v_readlane_b32_e64 && - instr->opcode != aco_opcode::v_writelane_b32_e64; + instr->opcode != aco_opcode::v_writelane_b32_e64 && + instr->opcode != aco_opcode::p_v_cvt_pk_u8_f32; } aco_ptr<Instruction> diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 6d0876f9bb1..105a18fe113 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -2990,6 +2990,14 @@ lower_to_hw_instr(Program* program) ctx.instructions.emplace_back(std::move(instr)); emit_set_mode(bld, block->fp_mode, set_round, false); + } else if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32) { + Definition def = instr->definitions[0]; + VALU_instruction& valu = + bld.vop3(aco_opcode::v_cvt_pk_u8_f32, def, instr->operands[0], + Operand::c32(def.physReg().byte()), Operand(def.physReg(), v1)) + ->valu(); + valu.abs = instr->valu().abs; + valu.neg = instr->valu().neg; } else if (instr->isMIMG() && instr->mimg().strict_wqm) { lower_image_sample(&ctx, instr); ctx.instructions.emplace_back(std::move(instr)); diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index c01b28c7773..6e37ee6fad6 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -1283,6 +1283,7 @@ VOP3 = { ("v_sad_u16", False, False, dst(1), src(1, 1, 1), op(0x15c, gfx8=0x1db, gfx10=0x15c, gfx11=0x224)), ("v_sad_u32", False, False, dst(1), src(1, 1, 1), op(0x15d, gfx8=0x1dc, gfx10=0x15d, gfx11=0x225)), ("v_cvt_pk_u8_f32", True, False, dst(1), src(1, 1, 1), op(0x15e, gfx8=0x1dd, gfx10=0x15e, gfx11=0x226)), + ("p_v_cvt_pk_u8_f32", True, False, dst(1), src(1), op(-1)), ("v_div_fixup_f32", True, True, dst(1), src(1, 1, 1), op(0x15f, gfx8=0x1de, gfx10=0x15f,
gfx11=0x227)), ("v_div_fixup_f64", True, True, dst(2), src(2, 2, 2), op(0x160, gfx8=0x1df, gfx10=0x160, gfx11=0x228)), ("v_lshl_b64", False, False, dst(2), src(2, 1), op(0x161, gfx8=-1), InstrClass.Valu64), diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 7a28515e602..fce5fc927e4 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -622,7 +622,7 @@ get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr if (instr->isVALU()) { assert(rc.bytes() <= 2); - if (can_use_SDWA(gfx_level, instr, false)) + if (can_use_SDWA(gfx_level, instr, false) || instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32) return std::make_pair(rc.bytes(), rc.bytes()); unsigned bytes_written = 4u; @@ -693,6 +693,9 @@ add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg r amd_gfx_level gfx_level = program->gfx_level; assert(instr->definitions[0].bytes() <= 2); + if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32) + return; + if (reg.byte() == 0 && allow_16bit_write && instr_is_16bit(gfx_level, instr->opcode)) return; diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 42fb5472d8d..09567ac20be 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -1253,6 +1253,7 @@ validate_subdword_definition(amd_gfx_level gfx_level, const aco_ptr<Instruction> case aco_opcode::global_load_short_d16_hi: case aco_opcode::ds_read_u8_d16_hi: case aco_opcode::ds_read_u16_d16_hi: return byte == 2; + case aco_opcode::p_v_cvt_pk_u8_f32: return true; default: break; } @@ -1269,6 +1270,9 @@ get_subdword_bytes_written(Program* program, const aco_ptr<Instruction>& instr, return gfx_level >= GFX8 ? def.bytes() : def.size() * 4u; if (instr->isVALU() || instr->isVINTRP()) { assert(def.bytes() <= 2); + if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32) + return 1; + if (instr->isSDWA()) return instr->sdwa().dst_sel.size();