nir: rework and fix rotate lowering

No driver supports urol/uror on all bit sizes. Intel gen11+ only for 16 and 32 bit, Nvidia GV100+ only for 32 bit. Etnaviv can support it on 8, 16 and 32 bit. Also turn the `lower` into a `has` option as only two drivers actually support `uror` and `urol` at this moment. Fixes crashes with CL integer_rotate on iris and nouveau since we emit urol for `rotate`. v2: always lower 64 bit Fixes: fe0965afa6 ("spirv: Don't use libclc for rotate") Signed-off-by: Karol Herbst <kherbst@redhat.com> Reviewed-by (Intel and nir): Ian Romanick <ian.d.romanick@intel.com> Reviewed-by: David Heidelberg <david.heidelberg@collabora.com> Acked-by: Yonggang Luo <luoyonggang@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27090> (cherry picked from commit f2b7c4ce29)
2024-01-16 12:55:10 +01:00
parent 1f89910f6b
commit 08229beb4e
26 changed files with 30 additions and 46 deletions
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -314,7 +314,7 @@
        "description": "nir: rework and fix rotate lowering",
        "nominated": true,
        "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
        "main_sha": null,
        "because_sha": "fe0965afa6becfc9c9aa341babd34bc5920e421b",
        "notes": null
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -93,7 +93,6 @@ get_nir_options_for_stage(struct radv_physical_device *device, gl_shader_stage s
      .lower_ffma64 = split_fma,
      .lower_fpow = true,
      .lower_mul_2x32_64 = true,
-      .lower_rotate = true,
      .lower_iadd_sat = device->rad_info.gfx_level <= GFX8,
      .lower_hadd = true,
      .lower_mul_32x16 = true,
--- a/src/asahi/compiler/agx_compile.h
+++ b/src/asahi/compiler/agx_compile.h
@@ -275,7 +275,6 @@ static const nir_shader_compiler_options agx_nir_options = {
   .lower_hadd = true,
   .vectorize_io = true,
   .use_interpolated_input_intrinsics = true,
-   .lower_rotate = true,
   .has_isub = true,
   .support_16bit_alu = true,
   .max_unroll_iterations = 32,
--- a/src/broadcom/vulkan/v3dv_pipeline.c
+++ b/src/broadcom/vulkan/v3dv_pipeline.c
@@ -228,7 +228,6 @@ const nir_shader_compiler_options v3dv_nir_options = {
   .lower_ldexp = true,
   .lower_mul_high = true,
   .lower_wpos_pntc = false,
-   .lower_rotate = true,
   .lower_to_scalar = true,
   .lower_device_index_to_zero = true,
   .lower_fquantize2f16 = true,
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -3789,8 +3789,10 @@ typedef struct nir_shader_compiler_options {
   /* Lowers when 32x32->64 bit multiplication is not supported */
   bool lower_mul_2x32_64;

-   /* Lowers when rotate instruction is not supported */
-   bool lower_rotate;
+   /* Indicates that urol and uror are supported */
+   bool has_rotate8;
+   bool has_rotate16;
+   bool has_rotate32;

   /** Backend supports ternary addition */
   bool has_iadd3;
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -1380,22 +1380,22 @@ optimizations.extend([
   (('ishr', a, 0), a),
   (('ushr', 0, a), 0),
   (('ushr', a, 0), a),
-   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
-   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'),
-   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
-   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), '!options->lower_rotate'),
-   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
-   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), '!options->lower_rotate'),
-   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
-   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), '!options->lower_rotate'),
-   (('urol@8',  a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub',  8, b))), 'options->lower_rotate'),
-   (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), 'options->lower_rotate'),
-   (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), 'options->lower_rotate'),
-   (('urol@64', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 64, b))), 'options->lower_rotate'),
-   (('uror@8',  a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub',  8, b))), 'options->lower_rotate'),
-   (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), 'options->lower_rotate'),
-   (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), 'options->lower_rotate'),
-   (('uror@64', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 64, b))), 'options->lower_rotate'),
+   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), 'options->has_rotate16'),
+   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), 'options->has_rotate16'),
+   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), 'options->has_rotate32'),
+   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), 'options->has_rotate32'),
+   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), 'options->has_rotate16'),
+   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), 'options->has_rotate16'),
+   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), 'options->has_rotate32'),
+   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), 'options->has_rotate32'),
+   (('urol@8',  a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub',  8, b))), '!options->has_rotate8'),
+   (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), '!options->has_rotate16'),
+   (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), '!options->has_rotate32'),
+   (('urol@64', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 64, b)))),
+   (('uror@8',  a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub',  8, b))), '!options->has_rotate8'),
+   (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), '!options->has_rotate16'),
+   (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), '!options->has_rotate32'),
+   (('uror@64', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 64, b)))),

   # bfi(X, a, b) = (b & ~X) | (a & X)
   # If X = ~0: (b & 0) | (a & 0xffffffff) = a
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -109,7 +109,6 @@ static const nir_shader_compiler_options ir3_base_options = {
   .lower_unpack_unorm_2x16 = true,
   .lower_pack_split = true,
   .use_interpolated_input_intrinsics = true,
-   .lower_rotate = true,
   .lower_to_scalar = true,
   .has_imul24 = true,
   .has_fsub = true,
--- a/src/gallium/auxiliary/nir/nir_to_tgsi.c
+++ b/src/gallium/auxiliary/nir/nir_to_tgsi.c
@@ -3689,13 +3689,15 @@ ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s,
       !options->lower_fdph ||
       !options->lower_flrp64 ||
       !options->lower_fmod ||
-       !options->lower_rotate ||
       !options->lower_uadd_carry ||
       !options->lower_usub_borrow ||
       !options->lower_uadd_sat ||
       !options->lower_usub_sat ||
       !options->lower_uniforms_to_ubo ||
       !options->lower_vector_cmp ||
+       options->has_rotate8 ||
+       options->has_rotate16 ||
+       options->has_rotate32 ||
       options->lower_fsqrt != lower_fsqrt ||
       options->force_indirect_unrolling != no_indirects_mask ||
       force_indirect_unrolling_sampler) {
@@ -3709,7 +3711,6 @@ ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s,
      new_options->lower_fdph = true;
      new_options->lower_flrp64 = true;
      new_options->lower_fmod = true;
-      new_options->lower_rotate = true;
      new_options->lower_uadd_carry = true;
      new_options->lower_usub_borrow = true;
      new_options->lower_uadd_sat = true;
@@ -3717,6 +3718,9 @@ ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s,
      new_options->lower_uniforms_to_ubo = true;
      new_options->lower_vector_cmp = true;
      new_options->lower_fsqrt = lower_fsqrt;
+      new_options->has_rotate8 = false;
+      new_options->has_rotate16 = false;
+      new_options->has_rotate32 = false;
      new_options->force_indirect_unrolling = no_indirects_mask;
      new_options->force_indirect_unrolling_sampler = force_indirect_unrolling_sampler;

@@ -4062,7 +4066,6 @@ static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
   .lower_fdph = true,
   .lower_flrp64 = true,
   .lower_fmod = true,
-   .lower_rotate = true,
   .lower_uniforms_to_ubo = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
--- a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
+++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
@@ -43,7 +43,6 @@ static const nir_shader_compiler_options options = {
   .lower_all_io_to_temps = true,
   .vertex_id_zero_based = true, /* its not implemented anyway */
   .lower_bitops = true,
-   .lower_rotate = true,
   .lower_vector_cmp = true,
   .lower_fdph = true,
   .has_fsub = true,
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -117,7 +117,6 @@ static const nir_shader_compiler_options i915_compiler_options = {
   .lower_fdph = true,
   .lower_flrp32 = true,
   .lower_fmod = true,
-   .lower_rotate = true,
   .lower_sincos = true,
   .lower_uniforms_to_ubo = true,
   .lower_vector_cmp = true,
@@ -161,7 +160,6 @@ static const struct nir_shader_compiler_options gallivm_nir_options = {
   .lower_unpack_half_2x16 = true,
   .lower_extract_byte = true,
   .lower_extract_word = true,
-   .lower_rotate = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_mul_2x32_64 = true,
--- a/src/gallium/drivers/lima/lima_program.c
+++ b/src/gallium/drivers/lima/lima_program.c
@@ -57,7 +57,6 @@ static const nir_shader_compiler_options vs_nir_options = {
   /* could be implemented by clamp */
   .lower_fsat = true,
   .lower_bitops = true,
-   .lower_rotate = true,
   .lower_sincos = true,
   .lower_fceil = true,
   .lower_insert_byte = true,
@@ -78,7 +77,6 @@ static const nir_shader_compiler_options fs_nir_options = {
   .lower_flrp32 = true,
   .lower_flrp64 = true,
   .lower_fsign = true,
-   .lower_rotate = true,
   .lower_fdot = true,
   .lower_fdph = true,
   .lower_insert_byte = true,
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -616,7 +616,6 @@ static const struct nir_shader_compiler_options gallivm_nir_options = {
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
-   .lower_rotate = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_mul_2x32_64 = true,
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -477,7 +477,6 @@ static const nir_shader_compiler_options nv30_base_compiler_options = {
   .lower_flrp64 = true,
   .lower_fmod = true,
   .lower_fpow = true, /* In hardware as of nv40 FS */
-   .lower_rotate = true,
   .lower_uniforms_to_ubo = true,
   .lower_vector_cmp = true,
   .force_indirect_unrolling = nir_var_all,
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -503,7 +503,6 @@ static int r300_get_video_param(struct pipe_screen *screen,
   .lower_ftrunc = true,                      \
   .lower_insert_byte = true,                 \
   .lower_insert_word = true,                 \
-   .lower_rotate = true,                      \
   .lower_uniforms_to_ubo = true,             \
   .lower_vector_cmp = true,                  \
   .no_integers = true,                       \
--- a/src/gallium/drivers/r600/r600_pipe_common.c
+++ b/src/gallium/drivers/r600/r600_pipe_common.c
@@ -1394,7 +1394,6 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 		.lower_insert_byte = true,
 		.lower_insert_word = true,
 		.lower_ldexp = true,
-		.lower_rotate = true,
 		/* due to a bug in the shader compiler, some loops hang
 		 * if they are not unrolled, see:
 		 *    https://bugs.freedesktop.org/show_bug.cgi?id=86720
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -1323,7 +1323,6 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
      .lower_hadd = true,
      .lower_hadd64 = true,
      .lower_fisnormal = true,
-      .lower_rotate = true,
      .lower_to_scalar = true,
      .lower_to_scalar_filter = sscreen->info.has_packed_math_16bit ?
                                   si_alu_to_scalar_packed_math_filter : NULL,
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -85,7 +85,6 @@ static const nir_shader_compiler_options sp_compiler_options = {
   .lower_fdph = true,
   .lower_flrp64 = true,
   .lower_fmod = true,
-   .lower_rotate = true,
   .lower_uniforms_to_ubo = true,
   .lower_vector_cmp = true,
   .lower_int64_options = nir_lower_imul_2x32_64,
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -737,7 +737,6 @@ vgpu10_get_shader_param(struct pipe_screen *screen,
   .lower_fdph = true,                                                        \
   .lower_flrp64 = true,                                                      \
   .lower_ldexp = true,                                                       \
-   .lower_rotate = true,                                                      \
   .lower_uniforms_to_ubo = true,                                             \
   .lower_vector_cmp = true,                                                  \
   .lower_cs_local_index_to_id = true,                                        \
--- a/src/gallium/drivers/v3d/v3d_screen.c
+++ b/src/gallium/drivers/v3d/v3d_screen.c
@@ -729,7 +729,6 @@ static const nir_shader_compiler_options v3d_nir_options = {
        .lower_ldexp = true,
        .lower_mul_high = true,
        .lower_wpos_pntc = true,
-        .lower_rotate = true,
        .lower_to_scalar = true,
        .lower_int64_options = nir_lower_imul_2x32_64,
        .lower_fquantize2f16 = true,
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2174,7 +2174,6 @@ static const nir_shader_compiler_options nir_options = {
        .lower_ldexp = true,
        .lower_fneg = true,
        .lower_ineg = true,
-        .lower_rotate = true,
        .lower_to_scalar = true,
        .lower_umax = true,
        .lower_umin = true,
--- a/src/gallium/drivers/zink/zink_compiler.c
+++ b/src/gallium/drivers/zink/zink_compiler.c
@@ -1227,7 +1227,6 @@ zink_screen_init_compiler(struct zink_screen *screen)
      .lower_ldexp = true,

      .lower_mul_high = true,
-      .lower_rotate = true,
      .lower_uadd_carry = true,
      .lower_usub_borrow = true,
      .lower_uadd_sat = true,
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -189,7 +189,8 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
      nir_options->has_bfm = devinfo->ver >= 7;
      nir_options->has_bfi = devinfo->ver >= 7;

-      nir_options->lower_rotate = devinfo->ver < 11;
+      nir_options->has_rotate16 = devinfo->ver >= 11;
+      nir_options->has_rotate32 = devinfo->ver >= 11;
      nir_options->lower_bitfield_reverse = devinfo->ver < 7;
      nir_options->lower_find_lsb = devinfo->ver < 7;
      nir_options->lower_ifind_msb = devinfo->ver < 7;
--- a/src/microsoft/compiler/nir_to_dxil.c
+++ b/src/microsoft/compiler/nir_to_dxil.c
@@ -119,7 +119,6 @@ nir_options = {
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_mul_high = true,
-   .lower_rotate = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_snorm_4x8 = true,
--- a/src/nouveau/codegen/nv50_ir_from_nir.cpp
+++ b/src/nouveau/codegen/nv50_ir_from_nir.cpp
@@ -3465,7 +3465,7 @@ nvir_nir_shader_compiler_options(int chipset, uint8_t shader_type)
   op.unify_interfaces = false;
   op.use_interpolated_input_intrinsics = true;
   op.lower_mul_2x32_64 = true; // TODO
-   op.lower_rotate = (chipset < NVISA_GV100_CHIPSET);
+   op.has_rotate32 = (chipset >= NVISA_GV100_CHIPSET);
   op.has_imul24 = false;
   op.has_fmulz = (chipset > NVISA_G80_CHIPSET);
   op.intel_vec4 = false;
--- a/src/panfrost/compiler/bifrost_compile.h
+++ b/src/panfrost/compiler/bifrost_compile.h
@@ -55,7 +55,6 @@ void bifrost_compile_shader_nir(nir_shader *nir,
      .lower_bitfield_insert = true,                                           \
      .lower_bitfield_extract = true,                                          \
      .lower_insert_byte = true,                                               \
-      .lower_rotate = true,                                                    \
                                                                               \
      /* Vertex ID is zero based in the traditional geometry flows, but not in \
       * the memory-allocated IDVS flow introduced and used exclusively in     \
--- a/src/panfrost/midgard/midgard_compile.h
+++ b/src/panfrost/midgard/midgard_compile.h
@@ -73,7 +73,6 @@ static const nir_shader_compiler_options midgard_nir_options = {
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_ldexp = true,
-   .lower_rotate = true,

   .lower_pack_half_2x16 = true,
   .lower_pack_unorm_2x16 = true,