nir: intel/compiler: Add and use nir_op_pack_32_4x8_split
A lot of CTS tests write a u8vec4 or an i8vec4 to an SSBO. This results
in a lot of shifts and MOVs. When that pattern can be recognized, the
individual 8-bit components can be packed much more efficiently.
v2: Rebase on b4369de27f ("nir/lower_packing: use shader_instructions_pass").
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9025>
This commit is contained in:
@@ -3666,6 +3666,9 @@ typedef struct nir_shader_compiler_options {
|
|||||||
* iadd(x, ineg(y)). If true, driver should call nir_opt_algebraic_late(). */
|
* iadd(x, ineg(y)). If true, driver should call nir_opt_algebraic_late(). */
|
||||||
bool has_isub;
|
bool has_isub;
|
||||||
|
|
||||||
|
/** Backend supports pack_32_4x8 or pack_32_4x8_split. */
|
||||||
|
bool has_pack_32_4x8;
|
||||||
|
|
||||||
/** Backend supports txs, if not nir_lower_tex(..) uses txs-free variants
|
/** Backend supports txs, if not nir_lower_tex(..) uses txs-free variants
|
||||||
* for rect texture lowering. */
|
* for rect texture lowering. */
|
||||||
bool has_txs;
|
bool has_txs;
|
||||||
|
@@ -86,6 +86,15 @@ lower_unpack_64_to_16(nir_builder *b, nir_ssa_def *src)
|
|||||||
nir_unpack_32_2x16_split_y(b, zw));
|
nir_unpack_32_2x16_split_y(b, zw));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static nir_ssa_def *
|
||||||
|
lower_pack_32_from_8(nir_builder *b, nir_ssa_def *src)
|
||||||
|
{
|
||||||
|
return nir_pack_32_4x8_split(b, nir_channel(b, src, 0),
|
||||||
|
nir_channel(b, src, 1),
|
||||||
|
nir_channel(b, src, 2),
|
||||||
|
nir_channel(b, src, 3));
|
||||||
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
lower_pack_instr(nir_builder *b, nir_instr *instr, void *data)
|
lower_pack_instr(nir_builder *b, nir_instr *instr, void *data)
|
||||||
{
|
{
|
||||||
@@ -99,8 +108,8 @@ lower_pack_instr(nir_builder *b, nir_instr *instr, void *data)
|
|||||||
alu_instr->op != nir_op_pack_64_4x16 &&
|
alu_instr->op != nir_op_pack_64_4x16 &&
|
||||||
alu_instr->op != nir_op_unpack_64_4x16 &&
|
alu_instr->op != nir_op_unpack_64_4x16 &&
|
||||||
alu_instr->op != nir_op_pack_32_2x16 &&
|
alu_instr->op != nir_op_pack_32_2x16 &&
|
||||||
alu_instr->op != nir_op_unpack_32_2x16)
|
alu_instr->op != nir_op_unpack_32_2x16 &&
|
||||||
|
alu_instr->op != nir_op_pack_32_4x8)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
b->cursor = nir_before_instr(&alu_instr->instr);
|
b->cursor = nir_before_instr(&alu_instr->instr);
|
||||||
@@ -127,6 +136,9 @@ lower_pack_instr(nir_builder *b, nir_instr *instr, void *data)
|
|||||||
case nir_op_unpack_32_2x16:
|
case nir_op_unpack_32_2x16:
|
||||||
dest = lower_unpack_32_to_16(b, src);
|
dest = lower_unpack_32_to_16(b, src);
|
||||||
break;
|
break;
|
||||||
|
case nir_op_pack_32_4x8:
|
||||||
|
dest = lower_pack_32_from_8(b, src);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
unreachable("Impossible opcode");
|
unreachable("Impossible opcode");
|
||||||
}
|
}
|
||||||
|
@@ -897,6 +897,10 @@ binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
|
|||||||
binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
|
binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
|
||||||
"src0 | ((uint32_t)src1 << 16)")
|
"src0 | ((uint32_t)src1 << 16)")
|
||||||
|
|
||||||
|
opcode("pack_32_4x8_split", 0, tuint32, [0, 0, 0, 0], [tuint8, tuint8, tuint8, tuint8],
|
||||||
|
False, "",
|
||||||
|
"src0 | ((uint32_t)src1 << 8) | ((uint32_t)src2 << 16) | ((uint32_t)src3 << 24)")
|
||||||
|
|
||||||
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
|
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
|
||||||
# and that of the "bfi1" i965 instruction. That is, the bits and offset values
|
# and that of the "bfi1" i965 instruction. That is, the bits and offset values
|
||||||
# are from the low five bits of src0 and src1, respectively.
|
# are from the low five bits of src0 and src1, respectively.
|
||||||
|
@@ -1313,6 +1313,10 @@ optimizations.extend([
|
|||||||
(('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
|
(('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
|
||||||
(('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
|
(('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
|
||||||
|
|
||||||
|
# Packing a u8vec4 to write to an SSBO.
|
||||||
|
(('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))),
|
||||||
|
('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'),
|
||||||
|
|
||||||
(('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)),
|
(('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)),
|
||||||
(('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)),
|
(('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)),
|
||||||
|
|
||||||
|
@@ -68,6 +68,7 @@
|
|||||||
.lower_usub_sat64 = true, \
|
.lower_usub_sat64 = true, \
|
||||||
.lower_hadd64 = true, \
|
.lower_hadd64 = true, \
|
||||||
.avoid_ternary_with_two_constants = true, \
|
.avoid_ternary_with_two_constants = true, \
|
||||||
|
.has_pack_32_4x8 = true, \
|
||||||
.max_unroll_iterations = 32, \
|
.max_unroll_iterations = 32, \
|
||||||
.force_indirect_unrolling = nir_var_function_temp
|
.force_indirect_unrolling = nir_var_function_temp
|
||||||
|
|
||||||
|
@@ -988,6 +988,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
|
|||||||
case nir_op_u2u32:
|
case nir_op_u2u32:
|
||||||
case nir_op_iabs:
|
case nir_op_iabs:
|
||||||
case nir_op_ineg:
|
case nir_op_ineg:
|
||||||
|
case nir_op_pack_32_4x8_split:
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
@@ -1721,6 +1722,10 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
|
|||||||
bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
|
bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case nir_op_pack_32_4x8_split:
|
||||||
|
bld.emit(FS_OPCODE_PACK, result, op, 4);
|
||||||
|
break;
|
||||||
|
|
||||||
case nir_op_unpack_64_2x32_split_x:
|
case nir_op_unpack_64_2x32_split_x:
|
||||||
case nir_op_unpack_64_2x32_split_y: {
|
case nir_op_unpack_64_2x32_split_y: {
|
||||||
if (instr->op == nir_op_unpack_64_2x32_split_x)
|
if (instr->op == nir_op_unpack_64_2x32_split_x)
|
||||||
|
Reference in New Issue
Block a user