pan/bi: Implement some extracts and inserts

Rather than lowering in NIR. Importantly for Valhall, this allows nir_opt_algebraic to optimize various bitwise ops into extracts and inserts, taking pressure off the low-throughput SFU pipe and moving it onto the high-throughput CVT pipe. This will mitigate a cycle count regression from switching to the precise idiv lowering. This also generates more integer widening conversions which we can fold into 32-bit instructions later, to allow optimizing GLSL like "(a & 0xFFFF) + b". Valhall: total instructions in shared programs: 2674836 -> 2674840 (<.01%) instructions in affected programs: 6473 -> 6477 (0.06%) helped: 14 HURT: 6 helped stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.16% max: 1.37% x̄: 0.41% x̃: 0.49% HURT stats (abs) min: 3.0 max: 3.0 x̄: 3.00 x̃: 3 HURT stats (rel) min: 1.19% max: 1.62% x̄: 1.35% x̃: 1.24% 95% mean confidence interval for instructions value: -0.68 1.08 95% mean confidence interval for instructions %-change: -0.30% 0.53% Inconclusive result (value mean confidence interval includes 0). total cycles in shared programs: 140627.42 -> 140627.36 (<.01%) cycles in affected programs: 2.31 -> 2.25 (-2.70%) helped: 1 HURT: 0 total cvt in shared programs: 14127.25 -> 14128.91 (0.01%) cvt in affected programs: 153.50 -> 155.16 (1.08%) helped: 0 HURT: 41 HURT stats (abs) min: 0.015625 max: 0.09375 x̄: 0.04 x̃: 0 HURT stats (rel) min: 0.27% max: 4.44% x̄: 1.61% x̃: 1.22% 95% mean confidence interval for cvt value: 0.03 0.05 95% mean confidence interval for cvt %-change: 1.29% 1.93% Cvt are HURT. total sfu in shared programs: 7555.69 -> 7549.31 (-0.08%) sfu in affected programs: 107.31 -> 100.94 (-5.94%) helped: 48 HURT: 0 helped stats (abs) min: 0.0625 max: 0.375 x̄: 0.13 x̃: 0 helped stats (rel) min: 1.34% max: 50.00% x̄: 13.57% x̃: 7.14% 95% mean confidence interval for sfu value: -0.15 -0.11 95% mean confidence interval for sfu %-change: -17.07% -10.06% Sfu are helped. 
total quadwords in shared programs: 1449912 -> 1449928 (<.01%) quadwords in affected programs: 256 -> 272 (6.25%) helped: 0 HURT: 2 Bifrost: total instructions in shared programs: 2415370 -> 2415380 (<.01%) instructions in affected programs: 1642 -> 1652 (0.61%) helped: 2 HURT: 6 helped stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.40% max: 0.40% x̄: 0.40% x̃: 0.40% HURT stats (abs) min: 2.0 max: 2.0 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.95% max: 1.27% x̄: 1.07% x̃: 1.00% 95% mean confidence interval for instructions value: 0.09 2.41 95% mean confidence interval for instructions %-change: 0.13% 1.29% Instructions are HURT. total tuples in shared programs: 1928495 -> 1928476 (<.01%) tuples in affected programs: 3329 -> 3310 (-0.57%) helped: 9 HURT: 2 helped stats (abs) min: 1.0 max: 6.0 x̄: 2.56 x̃: 2 helped stats (rel) min: 0.25% max: 2.33% x̄: 1.00% x̃: 0.75% HURT stats (abs) min: 2.0 max: 2.0 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.48% max: 0.48% x̄: 0.48% x̃: 0.48% 95% mean confidence interval for tuples value: -3.46 0.00 95% mean confidence interval for tuples %-change: -1.35% -0.10% Inconclusive result (value mean confidence interval includes 0). total clauses in shared programs: 354978 -> 354983 (<.01%) clauses in affected programs: 398 -> 403 (1.26%) helped: 3 HURT: 8 helped stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1 helped stats (rel) min: 2.33% max: 3.85% x̄: 2.83% x̃: 2.33% HURT stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1 HURT stats (rel) min: 2.27% max: 3.70% x̄: 2.88% x̃: 2.78% 95% mean confidence interval for clauses value: -0.17 1.08 95% mean confidence interval for clauses %-change: -0.51% 3.16% Inconclusive result (value mean confidence interval includes 0). 
total cycles in shared programs: 166575.69 -> 166575.65 (<.01%) cycles in affected programs: 6.88 -> 6.83 (-0.61%) helped: 1 HURT: 0 total arith in shared programs: 73688.79 -> 73688 (<.01%) arith in affected programs: 127.29 -> 126.50 (-0.62%) helped: 9 HURT: 2 helped stats (abs) min: 0.04166700000000034 max: 0.25 x̄: 0.11 x̃: 0 helped stats (rel) min: 0.26% max: 2.45% x̄: 1.07% x̃: 0.80% HURT stats (abs) min: 0.08333299999999966 max: 0.08333299999999966 x̄: 0.08 x̃: 0 HURT stats (rel) min: 0.55% max: 0.55% x̄: 0.55% x̃: 0.55% 95% mean confidence interval for arith value: -0.14 0.00 95% mean confidence interval for arith %-change: -1.44% -0.11% Inconclusive result (value mean confidence interval includes 0). total quadwords in shared programs: 1674514 -> 1674480 (<.01%) quadwords in affected programs: 9086 -> 9052 (-0.37%) helped: 23 HURT: 2 helped stats (abs) min: 1.0 max: 6.0 x̄: 1.65 x̃: 1 helped stats (rel) min: 0.15% max: 2.79% x̄: 0.63% x̃: 0.33% HURT stats (abs) min: 2.0 max: 2.0 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.53% max: 0.53% x̄: 0.53% x̃: 0.53% 95% mean confidence interval for quadwords value: -2.08 -0.64 95% mean confidence interval for quadwords %-change: -0.86% -0.21% Quadwords are helped. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17857>
2022-08-02 10:37:52 -04:00
parent 469e8c8e22
commit eab1d36643
2 changed files with 55 additions and 3 deletions
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -2625,6 +2625,56 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
                bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
                break;

+        case nir_op_extract_u8:
+        case nir_op_extract_i8: {
+                assert(comps == 1 && "should be scalarized");
+                assert((src_sz == 16 || src_sz == 32) && "should be lowered");
+                unsigned byte = nir_src_as_uint(instr->src[1].src);
+
+                if (s0.swizzle == BI_SWIZZLE_H11) {
+                        assert(byte < 2);
+                        byte += 2;
+                } else if (s0.swizzle != BI_SWIZZLE_H01) {
+                        assert(s0.swizzle == BI_SWIZZLE_H00);
+                }
+
+                assert(byte < 4);
+
+                s0.swizzle = BI_SWIZZLE_H01;
+
+                if (instr->op == nir_op_extract_i8)
+                        bi_s8_to_s32_to(b, dst, bi_byte(s0, byte));
+                else
+                        bi_u8_to_u32_to(b, dst, bi_byte(s0, byte));
+                break;
+        }
+
+        case nir_op_extract_u16:
+        case nir_op_extract_i16: {
+                assert(comps == 1 && "should be scalarized");
+                assert(src_sz == 32 && "should be lowered");
+                unsigned half = nir_src_as_uint(instr->src[1].src);
+                assert(half == 0 || half == 1);
+
+                if (instr->op == nir_op_extract_i16)
+                        bi_s16_to_s32_to(b, dst, bi_half(s0, half));
+                else
+                        bi_u16_to_u32_to(b, dst, bi_half(s0, half));
+                break;
+        }
+
+        case nir_op_insert_u16: {
+                assert(comps == 1 && "should be scalarized");
+                unsigned half = nir_src_as_uint(instr->src[1].src);
+                assert(half == 0 || half == 1);
+
+                if (half == 0)
+                        bi_u16_to_u32_to(b, dst, bi_half(s0, 0));
+                else
+                        bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0));
+                break;
+        }
+
        case nir_op_ishl:
                bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
                break;
@@ -4319,6 +4369,11 @@ bi_vectorize_filter(const nir_instr *instr, const void *data)
        case nir_op_ushr:
        case nir_op_f2i16:
        case nir_op_f2u16:
+        case nir_op_extract_u8:
+        case nir_op_extract_i8:
+        case nir_op_extract_u16:
+        case nir_op_extract_i16:
+        case nir_op_insert_u16:
                return 1;
        default:
                break;
--- a/src/panfrost/bifrost/bifrost_compile.h
+++ b/src/panfrost/bifrost/bifrost_compile.h
@@ -52,10 +52,7 @@ static const nir_shader_compiler_options bifrost_nir_options = {

        .lower_bitfield_insert_to_shifts = true,
        .lower_bitfield_extract_to_shifts = true,
-        .lower_extract_byte = true,
-        .lower_extract_word = true,
        .lower_insert_byte = true,
-        .lower_insert_word = true,
        .lower_rotate = true,

        .lower_pack_half_2x16 = true,