pan/bi: Implement fsin/fcos

Instead of lowering it in NIR, use the lookup tables as inputs to a second-order Taylor expansion. shader-db results aren't amazing but keep in mind this is without backend CSE yet. total instructions in shared programs: 115913 -> 115707 (-0.18%) instructions in affected programs: 3151 -> 2945 (-6.54%) helped: 12 HURT: 0 Instructions are helped. total nops in shared programs: 84045 -> 84041 (<.01%) nops in affected programs: 1571 -> 1567 (-0.25%) helped: 1 HURT: 7 Inconclusive result (value mean confidence interval includes 0). total clauses in shared programs: 20498 -> 20489 (-0.04%) clauses in affected programs: 188 -> 179 (-4.79%) helped: 6 HURT: 0 Clauses are helped. total quadwords in shared programs: 90395 -> 90291 (-0.12%) quadwords in affected programs: 2287 -> 2183 (-4.55%) helped: 12 HURT: 0 Quadwords are helped. Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9420>
2021-03-05 02:19:22 +00:00
parent 253b795451
commit 718bfdb3da
2 changed files with 56 additions and 1 deletions
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -1450,6 +1450,54 @@ bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
        bi_fadd_f32_to(b, dst, x1, x2, BI_ROUND_NONE);
 }

+/* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
+ * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and
+ * calculates the results. We use them to calculate sin/cos via a Taylor
+ * approximation:
+ *
+ * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
+ * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
+ * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
+ */
+
+#define TWO_OVER_PI  bi_imm_f32(2.0f / 3.14159f)
+#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
+#define SINCOS_BIAS  bi_imm_u32(0x49400000)
+
+static void
+bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
+{
+        /* bottom 6-bits of result times pi/32 approximately s0 mod 2pi */
+        bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS, BI_ROUND_NONE);
+
+        /* Approximate domain error (small) */
+        bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS),
+                                BI_ROUND_NONE),
+                        MPI_OVER_TWO, s0, BI_ROUND_NONE);
+
+        /* Lookup sin(x), cos(x) */
+        bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
+        bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
+
+        /* e^2 / 2 */
+        bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_neg(bi_zero()),
+                        bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_NONE);
+
+        /* (-e^2)/2 f''(x) */
+        bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),
+                        cos ? cosx : sinx,
+                        bi_neg(bi_zero()),  BI_ROUND_NONE);
+
+        /* e f'(x) - (e^2/2) f''(x) */
+        bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
+                        cos ? bi_neg(sinx) : cosx,
+                        quadratic, BI_ROUND_NONE);
+        I->clamp = BI_CLAMP_CLAMP_M1_1;
+
+        /* f(x) + e f'(x) - (e^2/2) f''(x) */
+        bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx, BI_ROUND_NONE);
+}
+
 static void
 bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
 {
@@ -1575,6 +1623,14 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
                bi_fadd_to(b, sz, dst, bi_abs(s0), bi_zero(), BI_ROUND_NONE);
                break;

+        case nir_op_fsin:
+                bi_lower_fsincos_32(b, dst, s0, false);
+                break;
+
+        case nir_op_fcos:
+                bi_lower_fsincos_32(b, dst, s0, true);
+                break;
+
        case nir_op_fexp2: {
                assert(sz == 32); /* should've been lowered */

--- a/src/panfrost/bifrost/bifrost_compile.h
+++ b/src/panfrost/bifrost/bifrost_compile.h
@@ -48,7 +48,6 @@ static const nir_shader_compiler_options bifrost_nir_options = {
        .lower_ifind_msb = true,
        .lower_fdph = true,
        .lower_fsqrt = true,
-        .lower_sincos = true,

        .lower_wpos_pntc = true,
        .lower_fsign = true,