aco: use v_cvt_pk_u8_f32 for f2u8

Foz-DB Navi31:
Totals from 42 (0.05% of 79395) affected shaders:
Instrs: 3253747 -> 3248867 (-0.15%); split: -0.15%, +0.00%
CodeSize: 16690136 -> 16661772 (-0.17%); split: -0.17%, +0.00%
VGPRs: 4176 -> 4128 (-1.15%)
Latency: 18485157 -> 18479752 (-0.03%); split: -0.03%, +0.00%
InvThroughput: 3659404 -> 3658222 (-0.03%); split: -0.03%, +0.00%
Copies: 231891 -> 228145 (-1.62%)
VALU: 1785800 -> 1782054 (-0.21%)

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29532>
This commit is contained in:
Georg Lehmann
2024-04-24 08:41:25 +02:00
committed by Marge Bot
parent 46ad5a01a8
commit 9bb10b58f3
6 changed files with 24 additions and 3 deletions

View File

@@ -3264,7 +3264,11 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
}
}
} else if (instr->src[0].src.ssa->bit_size == 32) {
- emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
+ if (dst.regClass() == v1b && ctx->program->gfx_level >= GFX11)
+ bld.vop3(aco_opcode::p_v_cvt_pk_u8_f32, Definition(dst),
+ get_alu_src(ctx, instr->src[0]));
+ else
+ emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
} else {
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
}

View File

@@ -433,7 +433,8 @@ can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp
instr->opcode != aco_opcode::v_permlanex16_b32 &&
instr->opcode != aco_opcode::v_permlane64_b32 &&
instr->opcode != aco_opcode::v_readlane_b32_e64 &&
- instr->opcode != aco_opcode::v_writelane_b32_e64;
+ instr->opcode != aco_opcode::v_writelane_b32_e64 &&
+ instr->opcode != aco_opcode::p_v_cvt_pk_u8_f32;
}
aco_ptr<Instruction>

View File

@@ -2990,6 +2990,14 @@ lower_to_hw_instr(Program* program)
ctx.instructions.emplace_back(std::move(instr));
emit_set_mode(bld, block->fp_mode, set_round, false);
+ } else if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32) {
+ Definition def = instr->definitions[0];
+ VALU_instruction& valu =
+ bld.vop3(aco_opcode::v_cvt_pk_u8_f32, def, instr->operands[0],
+ Operand::c32(def.physReg().byte()), Operand(def.physReg(), v1))
+ ->valu();
+ valu.abs = instr->valu().abs;
+ valu.neg = instr->valu().neg;
} else if (instr->isMIMG() && instr->mimg().strict_wqm) {
lower_image_sample(&ctx, instr);
ctx.instructions.emplace_back(std::move(instr));

View File

@@ -1283,6 +1283,7 @@ VOP3 = {
("v_sad_u16", False, False, dst(1), src(1, 1, 1), op(0x15c, gfx8=0x1db, gfx10=0x15c, gfx11=0x224)),
("v_sad_u32", False, False, dst(1), src(1, 1, 1), op(0x15d, gfx8=0x1dc, gfx10=0x15d, gfx11=0x225)),
("v_cvt_pk_u8_f32", True, False, dst(1), src(1, 1, 1), op(0x15e, gfx8=0x1dd, gfx10=0x15e, gfx11=0x226)),
+ ("p_v_cvt_pk_u8_f32", True, False, dst(1), src(1), op(-1)),
("v_div_fixup_f32", True, True, dst(1), src(1, 1, 1), op(0x15f, gfx8=0x1de, gfx10=0x15f, gfx11=0x227)),
("v_div_fixup_f64", True, True, dst(2), src(2, 2, 2), op(0x160, gfx8=0x1df, gfx10=0x160, gfx11=0x228)),
("v_lshl_b64", False, False, dst(2), src(2, 1), op(0x161, gfx8=-1), InstrClass.Valu64),

View File

@@ -622,7 +622,7 @@ get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr
if (instr->isVALU()) {
assert(rc.bytes() <= 2);
- if (can_use_SDWA(gfx_level, instr, false))
+ if (can_use_SDWA(gfx_level, instr, false) || instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
return std::make_pair(rc.bytes(), rc.bytes());
unsigned bytes_written = 4u;
@@ -693,6 +693,9 @@ add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg r
amd_gfx_level gfx_level = program->gfx_level;
assert(instr->definitions[0].bytes() <= 2);
+ if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
+ return;
if (reg.byte() == 0 && allow_16bit_write && instr_is_16bit(gfx_level, instr->opcode))
return;

View File

@@ -1253,6 +1253,7 @@ validate_subdword_definition(amd_gfx_level gfx_level, const aco_ptr<Instruction>
case aco_opcode::global_load_short_d16_hi:
case aco_opcode::ds_read_u8_d16_hi:
case aco_opcode::ds_read_u16_d16_hi: return byte == 2;
+ case aco_opcode::p_v_cvt_pk_u8_f32: return true;
default: break;
}
@@ -1269,6 +1270,9 @@ get_subdword_bytes_written(Program* program, const aco_ptr<Instruction>& instr,
return gfx_level >= GFX8 ? def.bytes() : def.size() * 4u;
if (instr->isVALU() || instr->isVINTRP()) {
assert(def.bytes() <= 2);
+ if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
+ return 1;
if (instr->isSDWA())
return instr->sdwa().dst_sel.size();