i965/tiled_memcpy: inline movntdqa loads in tiled_to_linear

The reference for MOVNTDQA says:

    For WC memory type, the nontemporal hint may be implemented by
    loading a temporary internal buffer with the equivalent of an
    aligned cache line without filling this data to the cache.
    [...]
    Subsequent MOVNTDQA reads to unread portions of the WC cache
    line will receive data from the temporary internal buffer if
    data is available.

This hidden cache line sized temporary buffer can improve the
read performance from wc maps.

v2: Add mfence at start of tiled_to_linear for streaming loads (Chris)
v3: add Android build support (Tapani)
v4: squash 'fix i915: Fix streaming loads for intel_tiled_memcpy'
    separate sse41 to own static library (Tapani)

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> (v2)
Reviewed-by: Matt Turner <mattst88@gmail.com> (v2)
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
2018-09-24 08:33:06 +03:00
parent 91d3a5d1a8
commit 11b1afdc92
9 changed files with 426 additions and 90 deletions
--- a/src/mesa/drivers/dri/i965/Android.mk
+++ b/src/mesa/drivers/dri/i965/Android.mk
@@ -51,6 +51,42 @@ I965_PERGEN_LIBS := \
 	libmesa_i965_gen10 \
 	libmesa_i965_gen11

+
+# ---------------------------------------
+# Build libmesa_intel_tiled_memcpy
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_intel_tiled_memcpy
+
+LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES)
+
+LOCAL_SRC_FILES := $(intel_tiled_memcpy_FILES)
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+# ---------------------------------------
+# Build libmesa_intel_tiled_memcpy_sse41
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_intel_tiled_memcpy_sse41
+
+LOCAL_C_INCLUDES := $(I965_PERGEN_COMMON_INCLUDES)
+
+LOCAL_SRC_FILES := $(intel_tiled_memcpy_sse41_FILES)
+
+ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
+LOCAL_CFLAGS += \
+	-DUSE_SSE41 -msse4.1 -mstackrealign
+endif
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
 # ---------------------------------------
 # Build libmesa_i965_gen4
 # ---------------------------------------
@@ -289,6 +325,8 @@ LOCAL_SRC_FILES := \
 LOCAL_WHOLE_STATIC_LIBRARIES := \
 	$(MESA_DRI_WHOLE_STATIC_LIBRARIES) \
 	$(I965_PERGEN_LIBS) \
+	libmesa_intel_tiled_memcpy \
+	libmesa_intel_tiled_memcpy_sse41 \
 	libmesa_intel_dev \
 	libmesa_intel_common \
 	libmesa_isl \
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -92,8 +92,20 @@ libi965_gen11_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=110

 noinst_LTLIBRARIES = \
 	libi965_dri.la \
+	libintel_tiled_memcpy.la \
+	libintel_tiled_memcpy_sse41.la \
 	$(I965_PERGEN_LIBS)

+libintel_tiled_memcpy_la_SOURCES = \
+	$(intel_tiled_memcpy_FILES)
+libintel_tiled_memcpy_la_CFLAGS = \
+	$(AM_CFLAGS)
+
+libintel_tiled_memcpy_sse41_la_SOURCES = \
+	$(intel_tiled_memcpy_sse41_FILES)
+libintel_tiled_memcpy_sse41_la_CFLAGS = \
+	$(AM_CFLAGS) $(SSE41_CFLAGS)
+
 libi965_dri_la_SOURCES = \
 	$(i965_FILES) \
 	$(i965_oa_GENERATED_FILES)
@@ -104,6 +116,8 @@ libi965_dri_la_LIBADD = \
 	$(top_builddir)/src/intel/compiler/libintel_compiler.la \
 	$(top_builddir)/src/intel/blorp/libblorp.la \
 	$(I965_PERGEN_LIBS) \
+	libintel_tiled_memcpy.la \
+	libintel_tiled_memcpy_sse41.la \
 	$(LIBDRM_LIBS)

 BUILT_SOURCES = $(i965_oa_GENERATED_FILES)
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -110,11 +110,17 @@ i965_FILES = \
 	intel_tex_image.c \
 	intel_tex_obj.h \
 	intel_tex_validate.c \
-	intel_tiled_memcpy.c \
-	intel_tiled_memcpy.h \
 	intel_upload.c \
 	libdrm_macros.h

+intel_tiled_memcpy_FILES = \
+	intel_tiled_memcpy_normal.c \
+	intel_tiled_memcpy.h
+
+intel_tiled_memcpy_sse41_FILES = \
+	intel_tiled_memcpy_sse41.c \
+	intel_tiled_memcpy_sse41.h
+
 i965_gen4_FILES = \
 	genX_blorp_exec.c \
 	genX_state_upload.c
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -566,6 +566,31 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
   }
 }

+#if defined(INLINE_SSE41) /* defined by intel_tiled_memcpy_sse41.c */
+static ALWAYS_INLINE void *
+_memcpy_streaming_load(void *dest, const void *src, size_t count)
+{
+   if (count == 16) { /* one streaming (MOVNTDQA) load, 16 bytes */
+      __m128i val = _mm_stream_load_si128((__m128i *)src); /* aligned src */
+      _mm_storeu_si128((__m128i *)dest, val); /* dest may be unaligned */
+      return dest;
+   } else if (count == 64) { /* four streaming loads, then four stores */
+      __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
+      __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
+      __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
+      __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
+      _mm_storeu_si128(((__m128i *)dest) + 0, val0);
+      _mm_storeu_si128(((__m128i *)dest) + 1, val1);
+      _mm_storeu_si128(((__m128i *)dest) + 2, val2);
+      _mm_storeu_si128(((__m128i *)dest) + 3, val3);
+      return dest;
+   } else { /* any other size: no streaming load, plain memcpy */
+      assert(count < 64); /* and (count < 16) for ytiled */
+      return memcpy(dest, src, count);
+   }
+}
+#endif
+
 static mem_copy_fn
 choose_copy_function(mem_copy_fn_type copy_type)
 {
@@ -574,6 +599,10 @@ choose_copy_function(mem_copy_fn_type copy_type)
      return memcpy;
   case INTEL_COPY_RGBA8:
      return rgba8_copy;
+#if defined(INLINE_SSE41)
+   case INTEL_COPY_STREAMING_LOAD:
+      return _memcpy_streaming_load;
+#endif
   default:
      assert(!"unreachable");
   }
@@ -696,6 +725,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
+#if defined(INLINE_SSE41)
+      else if (mem_copy == _memcpy_streaming_load)
+         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
      else
         unreachable("not reached");
   } else {
@@ -706,6 +741,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
+#if defined(INLINE_SSE41)
+      else if (mem_copy == _memcpy_streaming_load)
+         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
      else
         unreachable("not reached");
   }
@@ -740,6 +781,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
+#if defined(INLINE_SSE41)
+      else if (copy_type == INTEL_COPY_STREAMING_LOAD)
+         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
      else
         unreachable("not reached");
   } else {
@@ -750,6 +797,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
+#if defined(INLINE_SSE41)
+      else if (copy_type == INTEL_COPY_STREAMING_LOAD)
+         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
+                                 dst, src, dst_pitch, swizzle_bit,
+                                 memcpy, _memcpy_streaming_load);
+#endif
      else
         unreachable("not reached");
   }
@@ -768,14 +821,14 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
 * 'dst' is the address of (0, 0) in the destination tiled texture.
 * 'src' is the address of (xt1, yt1) in the source linear texture.
 */
-void
-linear_to_tiled(uint32_t xt1, uint32_t xt2,
-                uint32_t yt1, uint32_t yt2,
-                char *dst, const char *src,
-                uint32_t dst_pitch, int32_t src_pitch,
-                bool has_swizzling,
-                enum isl_tiling tiling,
-                mem_copy_fn_type copy_type)
+static void
+intel_linear_to_tiled(uint32_t xt1, uint32_t xt2,
+                      uint32_t yt1, uint32_t yt2,
+                      char *dst, const char *src,
+                      uint32_t dst_pitch, int32_t src_pitch,
+                      bool has_swizzling,
+                      enum isl_tiling tiling,
+                      mem_copy_fn_type copy_type)
 {
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
@@ -859,14 +912,14 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
 * 'dst' is the address of (xt1, yt1) in the destination linear texture.
 * 'src' is the address of (0, 0) in the source tiled texture.
 */
-void
-tiled_to_linear(uint32_t xt1, uint32_t xt2,
-                uint32_t yt1, uint32_t yt2,
-                char *dst, const char *src,
-                int32_t dst_pitch, uint32_t src_pitch,
-                bool has_swizzling,
-                enum isl_tiling tiling,
-                mem_copy_fn_type copy_type)
+static void
+intel_tiled_to_linear(uint32_t xt1, uint32_t xt2,
+                      uint32_t yt1, uint32_t yt2,
+                      char *dst, const char *src,
+                      int32_t dst_pitch, uint32_t src_pitch,
+                      bool has_swizzling,
+                      enum isl_tiling tiling,
+                      mem_copy_fn_type copy_type)
 {
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
@@ -889,6 +942,15 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
      unreachable("unsupported tiling");
   }

+#if defined(INLINE_SSE41)
+   if (copy_type == INTEL_COPY_STREAMING_LOAD) {
+      /* The hidden cacheline sized register used by movntdqa can apparently
+       * give you stale data, so do an mfence to invalidate it.
+       */
+      _mm_mfence();
+   }
+#endif
+
   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
@@ -938,69 +1000,3 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
      }
   }
 }
-
-
-/**
- * Determine which copy function to use for the given format combination
- *
- * The only two possible copy functions which are ever returned are a
- * direct memcpy and a RGBA <-> BGRA copy function.  Since RGBA -> BGRA and
- * BGRA -> RGBA are exactly the same operation (and memcpy is obviously
- * symmetric), it doesn't matter whether the copy is from the tiled image
- * to the untiled or vice versa.  The copy function required is the same in
- * either case so this function can be used.
- *
- * \param[in]  tiledFormat The format of the tiled image
- * \param[in]  format      The GL format of the client data
- * \param[in]  type        The GL type of the client data
- * \param[out] mem_copy    Will be set to one of either the standard
- *                         library's memcpy or a different copy function
- *                         that performs an RGBA to BGRA conversion
- * \param[out] cpp         Number of bytes per channel
- *
- * \return true if the format and type combination are valid
- */
-bool
-intel_get_memcpy_type(mesa_format tiledFormat, GLenum format, GLenum type,
-                      mem_copy_fn_type *copy_type, uint32_t *cpp)
-{
-   *copy_type = INTEL_COPY_INVALID;
-
-   if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
-       !(format == GL_RGBA || format == GL_BGRA))
-      return false; /* Invalid type/format combination */
-
-   if ((tiledFormat == MESA_FORMAT_L_UNORM8 && format == GL_LUMINANCE) ||
-       (tiledFormat == MESA_FORMAT_A_UNORM8 && format == GL_ALPHA)) {
-      *cpp = 1;
-      *copy_type = INTEL_COPY_MEMCPY;
-   } else if ((tiledFormat == MESA_FORMAT_B8G8R8A8_UNORM) ||
-              (tiledFormat == MESA_FORMAT_B8G8R8X8_UNORM) ||
-              (tiledFormat == MESA_FORMAT_B8G8R8A8_SRGB) ||
-              (tiledFormat == MESA_FORMAT_B8G8R8X8_SRGB)) {
-      *cpp = 4;
-      if (format == GL_BGRA) {
-         *copy_type = INTEL_COPY_MEMCPY;
-      } else if (format == GL_RGBA) {
-         *copy_type = INTEL_COPY_RGBA8;
-      }
-   } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
-              (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
-              (tiledFormat == MESA_FORMAT_R8G8B8A8_SRGB) ||
-              (tiledFormat == MESA_FORMAT_R8G8B8X8_SRGB)) {
-      *cpp = 4;
-      if (format == GL_BGRA) {
-         /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
-          * use the same function.
-          */
-         *copy_type = INTEL_COPY_RGBA8;
-      } else if (format == GL_RGBA) {
-         *copy_type = INTEL_COPY_MEMCPY;
-      }
-   }
-
-   if (*copy_type == INTEL_COPY_INVALID)
-      return false;
-
-   return true;
-}
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
@@ -38,11 +38,21 @@
 typedef enum {
  INTEL_COPY_MEMCPY = 0,
  INTEL_COPY_RGBA8,
+  INTEL_COPY_STREAMING_LOAD,
  INTEL_COPY_INVALID,
 } mem_copy_fn_type;

 typedef void *(*mem_copy_fn)(void *dest, const void *src, size_t n);

+typedef void (*tiled_to_linear_fn)
+   (uint32_t xt1, uint32_t xt2,
+    uint32_t yt1, uint32_t yt2,
+    char *dst, const char *src,
+    int32_t dst_pitch, uint32_t src_pitch,
+    bool has_swizzling,
+    enum isl_tiling tiling,
+    mem_copy_fn_type copy_type);
+
 void
 linear_to_tiled(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
@@ -61,8 +71,69 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
                enum isl_tiling tiling,
                mem_copy_fn_type copy_type);

-bool intel_get_memcpy_type(mesa_format tiledFormat, GLenum format,
-                           GLenum type, mem_copy_fn_type *copy_type,
-                           uint32_t *cpp);
+/**
+ * Determine which copy type to use for the given format combination
+ *
+ * The only two copy types ever selected here are INTEL_COPY_MEMCPY (a
+ * direct memcpy) and INTEL_COPY_RGBA8 (an RGBA <-> BGRA swizzling copy).
+ * Since RGBA -> BGRA and BGRA -> RGBA are exactly the same operation (and
+ * memcpy is obviously symmetric), it doesn't matter whether the copy is
+ * from the tiled image to the untiled or vice versa.  The copy type
+ * required is the same in either case so this function can be used.
+ *
+ * \param[in]  tiledFormat The format of the tiled image
+ * \param[in]  format      The GL format of the client data
+ * \param[in]  type        The GL type of the client data
+ * \param[out] copy_type   Set to INTEL_COPY_MEMCPY or INTEL_COPY_RGBA8 on
+ *                         success and to INTEL_COPY_INVALID on failure;
+ *                         never set to INTEL_COPY_STREAMING_LOAD here
+ * \param[out] cpp         Bytes per pixel (1 or 4); unwritten if no match
+ *
+ * \return true if the format and type combination are valid
+ */
+static MAYBE_UNUSED bool
+intel_get_memcpy_type(mesa_format tiledFormat, GLenum format, GLenum type,
+                      mem_copy_fn_type *copy_type, uint32_t *cpp)
+{
+   *copy_type = INTEL_COPY_INVALID;
+
+   if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
+       !(format == GL_RGBA || format == GL_BGRA))
+      return false; /* Invalid type/format combination */
+
+   if ((tiledFormat == MESA_FORMAT_L_UNORM8 && format == GL_LUMINANCE) ||
+       (tiledFormat == MESA_FORMAT_A_UNORM8 && format == GL_ALPHA)) {
+      *cpp = 1;
+      *copy_type = INTEL_COPY_MEMCPY;
+   } else if ((tiledFormat == MESA_FORMAT_B8G8R8A8_UNORM) ||
+              (tiledFormat == MESA_FORMAT_B8G8R8X8_UNORM) ||
+              (tiledFormat == MESA_FORMAT_B8G8R8A8_SRGB) ||
+              (tiledFormat == MESA_FORMAT_B8G8R8X8_SRGB)) {
+      *cpp = 4;
+      if (format == GL_BGRA) {
+         *copy_type = INTEL_COPY_MEMCPY;
+      } else if (format == GL_RGBA) {
+         *copy_type = INTEL_COPY_RGBA8;
+      }
+   } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
+              (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
+              (tiledFormat == MESA_FORMAT_R8G8B8A8_SRGB) ||
+              (tiledFormat == MESA_FORMAT_R8G8B8X8_SRGB)) {
+      *cpp = 4;
+      if (format == GL_BGRA) {
+         /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
+          * use the same function.
+          */
+         *copy_type = INTEL_COPY_RGBA8;
+      } else if (format == GL_RGBA) {
+         *copy_type = INTEL_COPY_MEMCPY;
+      }
+   }
+
+   if (*copy_type == INTEL_COPY_INVALID)
+      return false;
+
+   return true;
+}

 #endif /* INTEL_TILED_MEMCPY */
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy_normal.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy_normal.c
@@ -0,0 +1,59 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright 2012 Intel Corporation
+ * Copyright 2013 Google
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chad Versace <chad.versace@linux.intel.com>
+ *    Frank Henigman <fjhenigman@google.com>
+ */
+
+
+#include "intel_tiled_memcpy.c"
+
+void
+linear_to_tiled(uint32_t xt1, uint32_t xt2,
+                uint32_t yt1, uint32_t yt2,
+                char *dst, const char *src,
+                uint32_t dst_pitch, int32_t src_pitch,
+                bool has_swizzling,
+                enum isl_tiling tiling,
+                mem_copy_fn_type copy_type)
+{
+   intel_linear_to_tiled(xt1, xt2, yt1, yt2, dst, src, dst_pitch, src_pitch,
+                         has_swizzling, tiling, copy_type);
+}
+
+void
+tiled_to_linear(uint32_t xt1, uint32_t xt2,
+                uint32_t yt1, uint32_t yt2,
+                char *dst, const char *src,
+                int32_t dst_pitch, uint32_t src_pitch,
+                bool has_swizzling,
+                enum isl_tiling tiling,
+                mem_copy_fn_type copy_type)
+{
+   intel_tiled_to_linear(xt1, xt2, yt1, yt2, dst, src, dst_pitch, src_pitch,
+                         has_swizzling, tiling, copy_type);
+}
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy_sse41.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy_sse41.c
@@ -0,0 +1,61 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright 2012 Intel Corporation
+ * Copyright 2013 Google
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chad Versace <chad.versace@linux.intel.com>
+ *    Frank Henigman <fjhenigman@google.com>
+ */
+
+#define INLINE_SSE41
+
+#include "intel_tiled_memcpy_sse41.h"
+#include "intel_tiled_memcpy.c"
+
+void
+linear_to_tiled_sse41(uint32_t xt1, uint32_t xt2,
+                      uint32_t yt1, uint32_t yt2,
+                      char *dst, const char *src,
+                      uint32_t dst_pitch, int32_t src_pitch,
+                      bool has_swizzling,
+                      enum isl_tiling tiling,
+                      mem_copy_fn_type copy_type)
+{
+   intel_linear_to_tiled(xt1, xt2, yt1, yt2, dst, src, dst_pitch, src_pitch,
+                         has_swizzling, tiling, copy_type);
+}
+
+void
+tiled_to_linear_sse41(uint32_t xt1, uint32_t xt2,
+                     uint32_t yt1, uint32_t yt2,
+                     char *dst, const char *src,
+                     int32_t dst_pitch, uint32_t src_pitch,
+                     bool has_swizzling,
+                     enum isl_tiling tiling,
+                     mem_copy_fn_type copy_type)
+{
+   intel_tiled_to_linear(xt1, xt2, yt1, yt2, dst, src, dst_pitch, src_pitch,
+                         has_swizzling, tiling, copy_type);
+}
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy_sse41.h
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy_sse41.h
@@ -0,0 +1,59 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright 2012 Intel Corporation
+ * Copyright 2013 Google
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chad Versace <chad.versace@linux.intel.com>
+ *    Frank Henigman <fjhenigman@google.com>
+ */
+
+#ifndef INTEL_TILED_MEMCPY_SSE41_H
+#define INTEL_TILED_MEMCPY_SSE41_H
+
+#include <stdint.h>
+#include "main/mtypes.h"
+#include "isl/isl.h"
+
+#include "intel_tiled_memcpy.h"
+
+void
+linear_to_tiled_sse41(uint32_t xt1, uint32_t xt2,
+                      uint32_t yt1, uint32_t yt2,
+                      char *dst, const char *src,
+                      uint32_t dst_pitch, int32_t src_pitch,
+                      bool has_swizzling,
+                      enum isl_tiling tiling,
+                      mem_copy_fn_type copy_type);
+
+void
+tiled_to_linear_sse41(uint32_t xt1, uint32_t xt2,
+                      uint32_t yt1, uint32_t yt2,
+                      char *dst, const char *src,
+                      int32_t dst_pitch, uint32_t src_pitch,
+                      bool has_swizzling,
+                      enum isl_tiling tiling,
+                      mem_copy_fn_type copy_type);
+
+#endif /* INTEL_TILED_MEMCPY_SSE41_H */
--- a/src/mesa/drivers/dri/i965/meson.build
+++ b/src/mesa/drivers/dri/i965/meson.build
@@ -129,12 +129,20 @@ files_i965 = files(
  'intel_tex_image.c',
  'intel_tex_obj.h',
  'intel_tex_validate.c',
-  'intel_tiled_memcpy.c',
-  'intel_tiled_memcpy.h',
  'intel_upload.c',
  'libdrm_macros.h',
 )

+files_intel_tiled_memcpy = files(
+  'intel_tiled_memcpy_normal.c',
+  'intel_tiled_memcpy.h',
+)
+
+files_intel_tiled_memcpy_sse41 = files(
+  'intel_tiled_memcpy_sse41.c',
+  'intel_tiled_memcpy_sse41.h',
+)
+
 i965_gen_libs = []
 foreach v : ['40', '45', '50', '60', '70', '75', '80', '90', '100', '110']
  i965_gen_libs += static_library(
@@ -176,6 +184,30 @@ i965_oa_sources = custom_target(
  ],
 )

+intel_tiled_memcpy = static_library(
+  'intel_tiled_memcpy',
+  [files_intel_tiled_memcpy],
+  include_directories : [
+    inc_common, inc_intel, inc_dri_common, inc_drm_uapi,
+  ],
+  c_args : [c_vis_args, no_override_init_args, '-msse2'],
+)
+
+if with_sse41
+intel_tiled_memcpy_sse41 = static_library(
+  'intel_tiled_memcpy_sse41',
+  [files_intel_tiled_memcpy_sse41],
+  include_directories : [
+    inc_common, inc_intel, inc_dri_common, inc_drm_uapi,
+  ],
+  link_args : [ '-Wl,--exclude-libs=ALL' ],
+  c_args : [c_vis_args, no_override_init_args, '-Wl,--exclude-libs=ALL', '-msse2', sse41_args],
+)
+else
+intel_tiled_memcpy_sse41 = []
+endif
+
+
 libi965 = static_library(
  'i965',
  [files_i965, i965_oa_sources, ir_expression_operation_h,
@@ -187,7 +219,7 @@ libi965 = static_library(
  cpp_args : [cpp_vis_args, c_sse2_args],
  link_with : [
    i965_gen_libs, libintel_common, libintel_dev, libisl, libintel_compiler,
-    libblorp,
+    libblorp, intel_tiled_memcpy, intel_tiled_memcpy_sse41
  ],
  dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
 )