v3d: don't lower fsat on V3D 7.x

This requires that our nir compiler options are different between V3D
versions, so we can't use a static global any more.

total instructions in shared programs: 11241106 -> 11047872 (-1.72%)
instructions in affected programs: 4634458 -> 4441224 (-4.17%)
helped: 25119
HURT: 1717
Instructions are helped.

total threads in shared programs: 425238 -> 425036 (-0.05%)
threads in affected programs: 878 -> 676 (-23.01%)
helped: 79
HURT: 180
Inconclusive result (%-change mean confidence interval includes 0).

total loops in shared programs: 1968 -> 1933 (-1.78%)
loops in affected programs: 35 -> 0
helped: 35
HURT: 0
Loops are helped.

total uniforms in shared programs: 3845314 -> 3845219 (<.01%)
uniforms in affected programs: 213615 -> 213520 (-0.04%)
helped: 1338
HURT: 1059
Inconclusive result (value mean confidence interval includes 0).

total max-temps in shared programs: 2224313 -> 2221507 (-0.13%)
max-temps in affected programs: 236054 -> 233248 (-1.19%)
helped: 4863
HURT: 3357
Max-temps are helped.

total spills in shared programs: 4264 -> 4294 (0.70%)
spills in affected programs: 274 -> 304 (10.95%)
helped: 8
HURT: 16

total fills in shared programs: 6638 -> 6497 (-2.12%)
fills in affected programs: 2240 -> 2099 (-6.29%)
helped: 55
HURT: 17

total sfu-stalls in shared programs: 14942 -> 14353 (-3.94%)
sfu-stalls in affected programs: 4863 -> 4274 (-12.11%)
helped: 1287
HURT: 1165
Sfu-stalls are helped.

total inst-and-stalls in shared programs: 11256048 -> 11062225 (-1.72%)
inst-and-stalls in affected programs: 4635701 -> 4441878 (-4.18%)
helped: 25074
HURT: 1728
Inst-and-stalls are helped.

total nops in shared programs: 270482 -> 270621 (0.05%)
nops in affected programs: 27579 -> 27718 (0.50%)
helped: 1583
HURT: 1967
Inconclusive result (value mean confidence interval includes 0).

Reviewed-by: Juan A. Suarez <jasuarez@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30086>
2024-07-08 12:58:23 +02:00
parent 33187012ab
commit d3a684803d
1 changed files with 81 additions and 72 deletions
--- a/src/gallium/drivers/v3d/v3d_screen.c
+++ b/src/gallium/drivers/v3d/v3d_screen.c
@@ -673,81 +673,90 @@ v3d_screen_is_format_supported(struct pipe_screen *pscreen,
        return true;
 }

-static const nir_shader_compiler_options v3d_nir_options = {
-        .compact_arrays = true,
-        .lower_uadd_sat = true,
-        .lower_usub_sat = true,
-        .lower_iadd_sat = true,
-        .lower_all_io_to_temps = true,
-        .lower_extract_byte = true,
-        .lower_extract_word = true,
-        .lower_insert_byte = true,
-        .lower_insert_word = true,
-        .lower_bitfield_insert = true,
-        .lower_bitfield_extract = true,
-        .lower_bitfield_reverse = true,
-        .lower_bit_count = true,
-        .lower_cs_local_id_to_index = true,
-        .lower_ffract = true,
-        .lower_fmod = true,
-        .lower_pack_unorm_2x16 = true,
-        .lower_pack_snorm_2x16 = true,
-        .lower_pack_unorm_4x8 = true,
-        .lower_pack_snorm_4x8 = true,
-        .lower_unpack_unorm_4x8 = true,
-        .lower_unpack_snorm_4x8 = true,
-        .lower_pack_half_2x16 = true,
-        .lower_unpack_half_2x16 = true,
-        .lower_pack_32_2x16 = true,
-        .lower_pack_32_2x16_split = true,
-        .lower_unpack_32_2x16_split = true,
-        .lower_fdiv = true,
-        .lower_find_lsb = true,
-        .lower_ffma16 = true,
-        .lower_ffma32 = true,
-        .lower_ffma64 = true,
-        .lower_flrp32 = true,
-        .lower_fpow = true,
-        .lower_fsat = true,
-        .lower_fsqrt = true,
-        .lower_ifind_msb = true,
-        .lower_isign = true,
-        .lower_ldexp = true,
-        .lower_hadd = true,
-        .lower_fisnormal = true,
-        .lower_mul_high = true,
-        .lower_wpos_pntc = true,
-        .lower_to_scalar = true,
-        .lower_int64_options =
-                nir_lower_bcsel64 |
-                nir_lower_conv64 |
-                nir_lower_iadd64 |
-                nir_lower_icmp64 |
-                nir_lower_imul_2x32_64 |
-                nir_lower_imul64 |
-                nir_lower_ineg64 |
-                nir_lower_logic64 |
-                nir_lower_shift64 |
-                nir_lower_ufind_msb64,
-        .lower_fquantize2f16 = true,
-        .has_fsub = true,
-        .has_isub = true,
-        .divergence_analysis_options =
-                nir_divergence_multiple_workgroup_per_compute_subgroup,
-        /* This will enable loop unrolling in the state tracker so we won't
-         * be able to selectively disable it in backend if it leads to
-         * lower thread counts or TMU spills. Choose a conservative maximum to
-         * limit register pressure impact.
-         */
-        .max_unroll_iterations = 16,
-        .force_indirect_unrolling_sampler = true,
-};
-
 static const void *
 v3d_screen_get_compiler_options(struct pipe_screen *pscreen,
-                                enum pipe_shader_ir ir, enum pipe_shader_type shader)
+                                enum pipe_shader_ir ir,
+                                enum pipe_shader_type shader)
 {
-        return &v3d_nir_options;
+        struct v3d_screen *screen = v3d_screen(pscreen);
+        const struct v3d_device_info *devinfo = &screen->devinfo;
+
+        static bool initialized = false; /* set by the first call */
+        static nir_shader_compiler_options options = {
+                .compact_arrays = true,
+                .lower_uadd_sat = true,
+                .lower_usub_sat = true,
+                .lower_iadd_sat = true,
+                .lower_all_io_to_temps = true,
+                .lower_extract_byte = true,
+                .lower_extract_word = true,
+                .lower_insert_byte = true,
+                .lower_insert_word = true,
+                .lower_bitfield_insert = true,
+                .lower_bitfield_extract = true,
+                .lower_bitfield_reverse = true,
+                .lower_bit_count = true,
+                .lower_cs_local_id_to_index = true,
+                .lower_ffract = true,
+                .lower_fmod = true,
+                .lower_pack_unorm_2x16 = true,
+                .lower_pack_snorm_2x16 = true,
+                .lower_pack_unorm_4x8 = true,
+                .lower_pack_snorm_4x8 = true,
+                .lower_unpack_unorm_4x8 = true,
+                .lower_unpack_snorm_4x8 = true,
+                .lower_pack_half_2x16 = true,
+                .lower_unpack_half_2x16 = true,
+                .lower_pack_32_2x16 = true,
+                .lower_pack_32_2x16_split = true,
+                .lower_unpack_32_2x16_split = true,
+                .lower_fdiv = true,
+                .lower_find_lsb = true,
+                .lower_ffma16 = true,
+                .lower_ffma32 = true,
+                .lower_ffma64 = true,
+                .lower_flrp32 = true,
+                .lower_fpow = true,
+                .lower_fsqrt = true,
+                .lower_ifind_msb = true,
+                .lower_isign = true,
+                .lower_ldexp = true,
+                .lower_hadd = true,
+                .lower_fisnormal = true,
+                .lower_mul_high = true,
+                .lower_wpos_pntc = true,
+                .lower_to_scalar = true,
+                .lower_int64_options =
+                        nir_lower_bcsel64 |
+                        nir_lower_conv64 |
+                        nir_lower_iadd64 |
+                        nir_lower_icmp64 |
+                        nir_lower_imul_2x32_64 |
+                        nir_lower_imul64 |
+                        nir_lower_ineg64 |
+                        nir_lower_logic64 |
+                        nir_lower_shift64 |
+                        nir_lower_ufind_msb64,
+                .lower_fquantize2f16 = true,
+                .has_fsub = true,
+                .has_isub = true,
+                .divergence_analysis_options =
+                       nir_divergence_multiple_workgroup_per_compute_subgroup,
+                /* Setting this enables loop unrolling in the state tracker,
+                 * so the backend can't selectively disable it when it leads
+                 * to lower thread counts or TMU spills. Choose a conservative
+                 * maximum to limit register pressure impact.
+                 */
+                .max_unroll_iterations = 16,
+                .force_indirect_unrolling_sampler = true,
+        };
+
+        if (!initialized) {
+                options.lower_fsat = devinfo->ver < 71; /* keep fsat on 7.x */
+                initialized = true; /* NOTE(review): no locking here */
+        }
+
+        return &options; /* same static struct for every screen */
 }

 static const uint64_t v3d_available_modifiers[] = {