gallivm: Use llvm.fmuladd.*.

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
2016-04-04 00:05:33 +01:00
parent 9e8edfa190
commit 320d1191c6
7 changed files with 98 additions and 68 deletions
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1123,10 +1123,8 @@ generate_viewport(struct draw_llvm_variant *variant,

      /* divide by w */
      out = LLVMBuildFMul(builder, out, out3, "");
-      /* mult by scale */
-      out = LLVMBuildFMul(builder, out, scale, "");
-      /* add translation */
-      out = LLVMBuildFAdd(builder, out, trans, "");
+      /* mult by scale, add translation */
+      out = lp_build_fmuladd(builder, out, scale, trans);

      /* store transformed outputs */
      LLVMBuildStore(builder, out, outputs[pos][i]);
@@ -1303,22 +1301,19 @@ generate_clipmask(struct draw_llvm *llvm,
            plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
            plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_y");
            planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
-            test = LLVMBuildFMul(builder, planes, cv_y, "");
-            sum = LLVMBuildFAdd(builder, sum, test, "");
+            sum = lp_build_fmuladd(builder, planes, cv_y, sum);

            indices[2] = lp_build_const_int32(gallivm, 2);
            plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
            plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_z");
            planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
-            test = LLVMBuildFMul(builder, planes, cv_z, "");
-            sum = LLVMBuildFAdd(builder, sum, test, "");
+            sum = lp_build_fmuladd(builder, planes, cv_z, sum);

            indices[2] = lp_build_const_int32(gallivm, 3);
            plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
            plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_w");
            planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
-            test = LLVMBuildFMul(builder, planes, cv_w, "");
-            sum = LLVMBuildFAdd(builder, sum, test, "");
+            sum = lp_build_fmuladd(builder, planes, cv_w, sum);

            test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, sum);
            temp = lp_build_const_int_vec(gallivm, i32_type, 1LL << plane_idx);
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -50,7 +50,6 @@
 #include "util/u_memory.h"
 #include "util/u_debug.h"
 #include "util/u_math.h"
-#include "util/u_string.h"
 #include "util/u_cpu_detect.h"

 #include "lp_bld_type.h"
@@ -262,6 +261,22 @@ lp_build_min_simple(struct lp_build_context *bld,
 }


+LLVMValueRef
+lp_build_fmuladd(LLVMBuilderRef builder,
+                 LLVMValueRef a,
+                 LLVMValueRef b,
+                 LLVMValueRef c)
+{
+   LLVMTypeRef type = LLVMTypeOf(a);
+   assert(type == LLVMTypeOf(b));
+   assert(type == LLVMTypeOf(c));
+   char intrinsic[32];
+   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
+   LLVMValueRef args[] = { a, b, c };
+   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
+}
+
+
 /**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
@@ -1023,6 +1038,22 @@ lp_build_mul(struct lp_build_context *bld,
 }


+/* a * b + c (llvm.fmuladd for float types, mul then add otherwise) */
+LLVMValueRef
+lp_build_mad(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b,
+             LLVMValueRef c)
+{
+   const struct lp_type type = bld->type;
+   if (type.floating) {
+      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
+   } else {
+      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
+   }
+}
+
+
 /**
 * Small vector x scale multiplication optimization.
 */
@@ -1153,6 +1184,11 @@ lp_build_lerp_simple(struct lp_build_context *bld,

   delta = lp_build_sub(bld, v1, v0);

+   if (bld->type.floating) {
+      assert(flags == 0);
+      return lp_build_mad(bld, x, delta, v0);
+   }
+
   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
@@ -2717,23 +2753,10 @@ lp_build_sin_or_cos(struct lp_build_context *bld,
   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
-    * xmm1 = _mm_mul_ps(y, xmm1);
-    * xmm2 = _mm_mul_ps(y, xmm2);
-    * xmm3 = _mm_mul_ps(y, xmm3);
    */
-   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
-   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
-   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
-
-   /*
-    * x = _mm_add_ps(x, xmm1);
-    * x = _mm_add_ps(x, xmm2);
-    * x = _mm_add_ps(x, xmm3);
-    */
-
-   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
-   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
-   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
+   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
+   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
+   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);

   /*
    * Evaluate the first polynom  (0 <= x <= Pi/4)
@@ -2755,10 +2778,8 @@ lp_build_sin_or_cos(struct lp_build_context *bld,
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
-   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
-   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
-   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
-   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
+   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
+   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");

@@ -2796,13 +2817,10 @@ lp_build_sin_or_cos(struct lp_build_context *bld,
    * y2 = _mm_add_ps(y2, x);
    */

-   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
-   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
-   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
-   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
+   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
+   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
-   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
-   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
+   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);

   /*
    * select the correct result from the two polynoms
@@ -2969,19 +2987,19 @@ lp_build_polynomial(struct lp_build_context *bld,

      if (i % 2 == 0) {
         if (even)
-            even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
+            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      } else {
         if (odd)
-            odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
+            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
-      return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
+      return lp_build_mad(bld, odd, x, even);
   else if (even)
      return even;
   else
@@ -3212,7 +3230,7 @@ lp_build_log2_approx(struct lp_build_context *bld,
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
-   LLVMValueRef logmant = NULL;
+   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));
@@ -3261,13 +3279,11 @@ lp_build_log2_approx(struct lp_build_context *bld,
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
-      logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
-                                    ARRAY_SIZE(lp_build_log2_polynomial));
+      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
+                                ARRAY_SIZE(lp_build_log2_polynomial));

-      /* logmant = y * P(z) */
-      logmant = lp_build_mul(bld, y, logmant);
-
-      res = lp_build_add(bld, logmant, logexp);
+      /* y * P(z) + logexp */
+      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask,  zmask;
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -87,6 +87,21 @@ lp_build_div(struct lp_build_context *bld,
             LLVMValueRef b);


+/* llvm.fmuladd.* intrinsic: a * b + c, fused at LLVM's discretion */
+LLVMValueRef
+lp_build_fmuladd(LLVMBuilderRef builder,
+                 LLVMValueRef a,
+                 LLVMValueRef b,
+                 LLVMValueRef c);
+
+/* a * b + c (llvm.fmuladd for float types, mul then add otherwise) */
+LLVMValueRef
+lp_build_mad(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b,
+             LLVMValueRef c);
+
+
 /**
 * Set when the weights for normalized are prescaled, that is, in range
 * 0..2**n, as opposed to range 0..2**(n-1).
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
@@ -289,8 +289,7 @@ lp_build_linear_to_srgb(struct gallivm_state *gallivm,
      c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);

      tmp = lp_build_mul(&f32_bld, a_const, x0375);
-      tmp2 = lp_build_mul(&f32_bld, b_const, x05);
-      tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
+      tmp2 = lp_build_mad(&f32_bld, b_const, x05, c_const);
      pow_final = lp_build_add(&f32_bld, tmp, tmp2);
   }

--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -580,10 +580,8 @@ lp_build_brilinear_lod(struct lp_build_context *bld,

   lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);

-   lod_fpart = lp_build_mul(bld, lod_fpart,
-                            lp_build_const_vec(bld->gallivm, bld->type, factor));
-
-   lod_fpart = lp_build_add(bld, lod_fpart,
+   lod_fpart = lp_build_mad(bld, lod_fpart,
+                            lp_build_const_vec(bld->gallivm, bld->type, factor),
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
@@ -639,10 +637,8 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
   /* fpart = rho / 2**ipart */
   lod_fpart = lp_build_extract_mantissa(bld, rho);

-   lod_fpart = lp_build_mul(bld, lod_fpart,
-                            lp_build_const_vec(bld->gallivm, bld->type, factor));
-
-   lod_fpart = lp_build_add(bld, lod_fpart,
+   lod_fpart = lp_build_mad(bld, lod_fpart,
+                            lp_build_const_vec(bld->gallivm, bld->type, factor),
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -1577,6 +1577,19 @@ log_emit_cpu(

 }

+/* TGSI_OPCODE_MAD (CPU Only) */
+
+static void
+mad_emit_cpu(
+   const struct lp_build_tgsi_action * action,
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   emit_data->output[emit_data->chan] =
+      lp_build_mad(&bld_base->base,
+                   emit_data->args[0], emit_data->args[1], emit_data->args[2]);
+}
+
 /* TGSI_OPCODE_MAX (CPU Only) */

 static void
@@ -2162,6 +2175,7 @@ lp_set_default_actions_cpu(

   bld_base->op_actions[TGSI_OPCODE_LG2].emit = lg2_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_LOG].emit = log_emit_cpu;
+   bld_base->op_actions[TGSI_OPCODE_MAD].emit = mad_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_MAX].emit = max_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_MIN].emit = min_emit_cpu;
   bld_base->op_actions[TGSI_OPCODE_MOD].emit = mod_emit_cpu;
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -307,10 +307,8 @@ attribs_update_simple(struct lp_build_interp_soa_context *bld,
               /*
                * a = a0 + (x * dadx + y * dady)
                */
-               dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
-               dady = LLVMBuildFMul(builder, dady, pixoffy, "");
-               a = LLVMBuildFAdd(builder, a, dadx, "");
-               a = LLVMBuildFAdd(builder, a, dady, "");
+               a = lp_build_fmuladd(builder, dadx, pixoffx, a);
+               a = lp_build_fmuladd(builder, dady, pixoffy, a);

               if (interp == LP_INTERP_PERSPECTIVE) {
                  if (oow == NULL) {
@@ -437,13 +435,10 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
       */
      if (interp != LP_INTERP_CONSTANT &&
          interp != LP_INTERP_FACING) {
-         LLVMValueRef axaos, ayaos;
-         axaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->x),
-                               dadxaos, "");
-         ayaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->y),
-                               dadyaos, "");
-         a0aos = LLVMBuildFAdd(builder, a0aos, ayaos, "");
-         a0aos = LLVMBuildFAdd(builder, a0aos, axaos, "");
+         LLVMValueRef x = lp_build_broadcast_scalar(setup_bld, bld->x);
+         LLVMValueRef y = lp_build_broadcast_scalar(setup_bld, bld->y);
+         a0aos = lp_build_fmuladd(builder, x, dadxaos, a0aos);
+         a0aos = lp_build_fmuladd(builder, y, dadyaos, a0aos);
      }

      /*