diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 3b7adc38a6b..083e0ca5620 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -74,6 +74,8 @@ static const struct nir_shader_compiler_options nir_options = { .lower_unpack_half_2x16 = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_ffma16 = true, .lower_ffma32 = true, .lower_ffma64 = true, diff --git a/src/asahi/compiler/agx_compile.h b/src/asahi/compiler/agx_compile.h index 70aed6e48a8..5c90e44a144 100644 --- a/src/asahi/compiler/agx_compile.h +++ b/src/asahi/compiler/agx_compile.h @@ -155,6 +155,8 @@ static const nir_shader_compiler_options agx_nir_options = { .lower_fsign = true, .lower_rotate = true, .lower_pack_split = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_uniforms_to_ubo = true, .lower_cs_local_index_from_id = true, diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c index a4e1d97dbd7..99d7779038b 100644 --- a/src/broadcom/vulkan/v3dv_pipeline.c +++ b/src/broadcom/vulkan/v3dv_pipeline.c @@ -188,6 +188,8 @@ const nir_shader_compiler_options v3dv_nir_options = { .lower_all_io_to_temps = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_bitfield_insert_to_shifts = true, .lower_bitfield_extract_to_shifts = true, .lower_bitfield_reverse = true, diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 7981031b54a..1bad96e1f37 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -3259,6 +3259,8 @@ typedef struct nir_shader_compiler_options { bool lower_extract_byte; bool lower_extract_word; + bool lower_insert_byte; + bool lower_insert_word; bool lower_all_io_to_temps; bool lower_all_io_to_elements; diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index bb8fc5bea86..eda8d11cb3d 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -972,6 +972,10 @@ binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))") binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))") binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))") +# Byte/word insertion +binop("insert_u8", tuint, "", "(src0 & 0xff) << (src1 * 8)") +binop("insert_u16", tuint, "", "(src0 & 0xffff) << (src1 * 16)") + def triop(name, ty, alg_props, const_expr): opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr) diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 6339ebb5079..8dc20390491 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -2429,6 +2429,19 @@ for N in [16, 32]: ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'), ]) +# Byte insertion +late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) +late_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) +late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte')) + +late_optimizations += [ + # Word insertion + (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'), + + # Extract and then insert + (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)), + (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)), +] # Integer sizes for s in [8, 16, 32, 64]: diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index a45274b82df..4410cae41de 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -53,6 +53,8 @@ static const nir_shader_compiler_options options = { .vertex_id_zero_based = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_helper_invocation = true, .lower_bitfield_insert_to_shifts = true, .lower_bitfield_extract_to_shifts = true, @@ -107,6 +109,8 @@ static const nir_shader_compiler_options options_a6xx = { .vertex_id_zero_based = false, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_helper_invocation = true, .lower_bitfield_insert_to_shifts = true, .lower_bitfield_extract_to_shifts = true, diff --git a/src/gallium/auxiliary/nir/nir_to_tgsi.c b/src/gallium/auxiliary/nir/nir_to_tgsi.c index 409e9188be8..2b12d802618 100644 --- a/src/gallium/auxiliary/nir/nir_to_tgsi.c +++ b/src/gallium/auxiliary/nir/nir_to_tgsi.c @@ -2659,6 +2659,8 @@ ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s) if (!options->lower_extract_byte || !options->lower_extract_word || + !options->lower_insert_byte || + !options->lower_insert_word || !options->lower_fdph || !options->lower_flrp64 || !options->lower_fmod || @@ -2671,6 +2673,8 @@ ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s) new_options->lower_extract_byte = true; new_options->lower_extract_word = true; + new_options->lower_insert_byte = true; + new_options->lower_insert_word = true; new_options->lower_fdph = true; new_options->lower_flrp64 = true; new_options->lower_fmod = true; @@ -2835,6 +2839,8 @@ static const nir_shader_compiler_options nir_to_tgsi_compiler_options = { .fuse_ffma64 = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_fdph = true, .lower_flrp64 = true, .lower_fmod = true, diff --git a/src/gallium/drivers/etnaviv/etnaviv_screen.c b/src/gallium/drivers/etnaviv/etnaviv_screen.c index 467c17f26b9..82a0e152bfc 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_screen.c +++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c @@ -1067,6 +1067,8 @@ etna_screen_create(struct etna_device *dev, struct etna_gpu *gpu, .lower_fmod = true, .lower_vector_cmp = true, .lower_fdph = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_fdiv = true, /* !screen->specs.has_new_transcendentals */ .lower_fsign = !screen->specs.has_sign_floor_ceil, .lower_ffloor = !screen->specs.has_sign_floor_ceil, diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c index 40998acaebb..46c7ad7ff0f 100644 --- a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c +++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c @@ -47,6 +47,8 @@ static const nir_shader_compiler_options options = { .lower_fdph = true, .has_fsub = true, .has_isub = true, + .lower_insert_byte = true, + .lower_insert_word = true, }; const nir_shader_compiler_options * diff --git a/src/gallium/drivers/lima/lima_program.c b/src/gallium/drivers/lima/lima_program.c index 40af5c029be..562586b851e 100644 --- a/src/gallium/drivers/lima/lima_program.c +++ b/src/gallium/drivers/lima/lima_program.c @@ -59,6 +59,8 @@ static const nir_shader_compiler_options vs_nir_options = { .lower_rotate = true, .lower_sincos = true, .lower_fceil = true, + .lower_insert_byte = true, + .lower_insert_word = true, }; static const nir_shader_compiler_options fs_nir_options = { @@ -74,6 +76,8 @@ static const nir_shader_compiler_options fs_nir_options = { .lower_rotate = true, .lower_fdot = true, .lower_fdph = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_bitops = true, .lower_vector_cmp = true, }; diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index c1d642bc299..acf3e5d9224 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -577,6 +577,8 @@ static const struct nir_shader_compiler_options gallivm_nir_options = { .lower_unpack_half_2x16 = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_rotate = true, .lower_uadd_carry = true, .lower_usub_borrow = true, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp index 90150065f86..bb2e8c7d062 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp @@ -3265,6 +3265,8 @@ nvir_nir_shader_compiler_options(int chipset) op.lower_pack_split = false; op.lower_extract_byte = (chipset < NVISA_GM107_CHIPSET); op.lower_extract_word = (chipset < NVISA_GM107_CHIPSET); + op.lower_insert_byte = true; + op.lower_insert_word = true; op.lower_all_io_to_temps = false; op.lower_all_io_to_elements = false; op.vertex_id_zero_based = false; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index a8ad5526d82..cd9946baed6 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -978,6 +978,8 @@ static const nir_shader_compiler_options nir_options = { .lower_unpack_snorm_4x8 = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_all_io_to_temps = false, .lower_cs_local_index_from_id = true, .lower_rotate = true, diff --git a/src/gallium/drivers/r600/r600_pipe_common.c b/src/gallium/drivers/r600/r600_pipe_common.c index 5c6a02087d4..a23cf2acf48 100644 --- a/src/gallium/drivers/r600/r600_pipe_common.c +++ b/src/gallium/drivers/r600/r600_pipe_common.c @@ -1334,6 +1334,8 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, .lower_int64_options = ~0, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_rotate = true, .max_unroll_iterations = 32, .lower_interpolate_at = true, diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index 817d0e6cc64..569515a87d1 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -1004,6 +1004,8 @@ void si_init_screen_get_functions(struct si_screen *sscreen) .lower_unpack_unorm_4x8 = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_rotate = true, .lower_to_scalar = true, .optimize_sample_mask_in = true, diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index b652b4d260e..eecc5c1072c 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -78,6 +78,8 @@ static const nir_shader_compiler_options sp_compiler_options = { .fuse_ffma64 = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_fdph = true, .lower_flrp64 = true, .lower_fmod = true, diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c index 5b7d8d3c3ee..6112e4b1290 100644 --- a/src/gallium/drivers/v3d/v3d_screen.c +++ b/src/gallium/drivers/v3d/v3d_screen.c @@ -643,6 +643,8 @@ static const nir_shader_compiler_options v3d_nir_options = { .lower_all_io_to_temps = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_bitfield_insert_to_shifts = true, .lower_bitfield_extract_to_shifts = true, .lower_bitfield_reverse = true, diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 3ba8e6e5a94..f3c57942621 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -2173,6 +2173,8 @@ static const nir_shader_compiler_options nir_options = { .lower_all_io_to_temps = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_fdiv = true, .lower_ffma16 = true, .lower_ffma32 = true, diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c index c5a592fa76e..5fff56d3dff 100644 --- a/src/gallium/drivers/zink/zink_compiler.c +++ b/src/gallium/drivers/zink/zink_compiler.c @@ -379,6 +379,8 @@ zink_screen_init_compiler(struct zink_screen *screen) .lower_fsat = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_mul_high = true, .lower_rotate = true, .lower_uadd_carry = true, diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index 8b368bc0766..5ec16f958cf 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -45,6 +45,8 @@ .lower_device_index_to_zero = true, \ .vectorize_io = true, \ .use_interpolated_input_intrinsics = true, \ + .lower_insert_byte = true, \ + .lower_insert_word = true, \ .vertex_id_zero_based = true, \ .lower_base_vertex = true, \ .use_scoped_barrier = true, \ diff --git a/src/microsoft/compiler/nir_to_dxil.c b/src/microsoft/compiler/nir_to_dxil.c index 9952c395e52..c25d3fc9ad2 100644 --- a/src/microsoft/compiler/nir_to_dxil.c +++ b/src/microsoft/compiler/nir_to_dxil.c @@ -90,6 +90,8 @@ nir_options = { .lower_bitfield_extract_to_shifts = true, .lower_extract_word = true, .lower_extract_byte = true, + .lower_insert_word = true, + .lower_insert_byte = true, .lower_all_io_to_elements = true, .lower_all_io_to_temps = true, .lower_hadd = true, diff --git a/src/panfrost/bifrost/bifrost_compile.h b/src/panfrost/bifrost/bifrost_compile.h index 53cb19d7323..88433f1a292 100644 --- a/src/panfrost/bifrost/bifrost_compile.h +++ b/src/panfrost/bifrost/bifrost_compile.h @@ -56,6 +56,8 @@ static const nir_shader_compiler_options bifrost_nir_options = { .lower_bitfield_extract_to_shifts = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_rotate = true, .lower_pack_half_2x16 = true, diff --git a/src/panfrost/midgard/midgard_compile.h b/src/panfrost/midgard/midgard_compile.h index a4c9e1626bc..242ac116216 100644 --- a/src/panfrost/midgard/midgard_compile.h +++ b/src/panfrost/midgard/midgard_compile.h @@ -68,6 +68,8 @@ static const nir_shader_compiler_options midgard_nir_options = { .lower_bitfield_extract_to_shifts = true, .lower_extract_byte = true, .lower_extract_word = true, + .lower_insert_byte = true, + .lower_insert_word = true, .lower_rotate = true, .lower_pack_half_2x16 = true,