diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h
index 9dbc65a67a3..251fabf52e3 100644
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@@ -93,6 +93,7 @@ struct tu_physical_device
 
    bool has_cached_coherent_memory;
    bool has_cached_non_coherent_memory;
+   uintptr_t level1_dcache_size;
 
    struct {
       uint32_t type_count;
diff --git a/src/freedreno/vulkan/tu_knl_drm_msm.cc b/src/freedreno/vulkan/tu_knl_drm_msm.cc
index f68341e7994..8f99d60a6d6 100644
--- a/src/freedreno/vulkan/tu_knl_drm_msm.cc
+++ b/src/freedreno/vulkan/tu_knl_drm_msm.cc
@@ -236,6 +236,18 @@ tu_gem_info(const struct tu_device *dev, uint32_t gem_handle, uint32_t info)
 
    return req.value;
 }
 
+enum tu_mem_sync_op
+{
+   TU_MEM_SYNC_CACHE_TO_GPU,
+   TU_MEM_SYNC_CACHE_FROM_GPU,
+};
+
+void
+sync_cache_bo(struct tu_device *dev,
+              struct tu_bo *bo,
+              VkDeviceSize offset,
+              VkDeviceSize size,
+              enum tu_mem_sync_op op);
 static VkResult
 tu_allocate_userspace_iova(struct tu_device *dev,
@@ -416,7 +428,11 @@ msm_bo_init(struct tu_device *dev,
    };
 
    if (mem_property & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) {
-      req.flags |= MSM_BO_CACHED_COHERENT;
+      if (mem_property & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) {
+         req.flags |= MSM_BO_CACHED_COHERENT;
+      } else {
+         req.flags |= MSM_BO_CACHED;
+      }
    } else {
       req.flags |= MSM_BO_WC;
    }
@@ -443,6 +459,20 @@ msm_bo_init(struct tu_device *dev,
 
    /* We don't use bo->name here because for the !TU_DEBUG=bo case bo->name is NULL. */
    tu_bo_set_kernel_name(dev, bo, name);
 
+   if (result == VK_SUCCESS &&
+       (mem_property & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) &&
+       !(mem_property & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
+      tu_bo_map(dev, bo);
+
+      /* Cached non-coherent memory may already have dirty cache lines,
+       * so clean them before the GPU gets a chance to write into this
+       * memory.
+       *
+       * MSM already does this automatically for uncached (MSM_BO_WC) memory.
+       */
+      sync_cache_bo(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU);
+   }
+
    return result;
 }
@@ -582,12 +612,91 @@ msm_bo_finish(struct tu_device *dev, struct tu_bo *bo)
    u_rwlock_rdunlock(&dev->dma_bo_lock);
 }
 
+static inline void
+tu_sync_cacheline_to_gpu(void const *p __attribute__((unused)))
+{
+#if DETECT_ARCH_AARCH64
+   /* Clean data cache. */
+   __asm volatile("dc cvac, %0" : : "r" (p) : "memory");
+#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
+   __builtin_ia32_clflush(p);
+#elif DETECT_ARCH_ARM
+   /* DCCMVAC - same as DC CVAC on aarch64.
+    * Seems to be illegal to call from userspace.
+    */
+   //__asm volatile("mcr p15, 0, %0, c7, c10, 1" : : "r" (p) : "memory");
+   unreachable("Cache line clean is unsupported on ARMv7");
+#endif
+}
+
+static inline void
+tu_sync_cacheline_from_gpu(void const *p __attribute__((unused)))
+{
+#if DETECT_ARCH_AARCH64
+   /* Clean and Invalidate data cache, there is no separate Invalidate. */
+   __asm volatile("dc civac, %0" : : "r" (p) : "memory");
+#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
+   __builtin_ia32_clflush(p);
+#elif DETECT_ARCH_ARM
+   /* DCCIMVAC - same as DC CIVAC on aarch64.
+    * Seems to be illegal to call from userspace.
+    */
+   //__asm volatile("mcr p15, 0, %0, c7, c14, 1" : : "r" (p) : "memory");
+   unreachable("Cache line invalidate is unsupported on ARMv7");
+#endif
+}
+
+void
+sync_cache_bo(struct tu_device *dev,
+              struct tu_bo *bo,
+              VkDeviceSize offset,
+              VkDeviceSize size,
+              enum tu_mem_sync_op op)
+{
+   uintptr_t level1_dcache_size = dev->physical_device->level1_dcache_size;
+   char *start = (char *) bo->map + offset;
+   char *end = start + (size == VK_WHOLE_SIZE ? (bo->size - offset) : size);
+
+   start = (char *) ((uintptr_t) start & ~(level1_dcache_size - 1));
+
+   for (; start < end; start += level1_dcache_size) {
+      if (op == TU_MEM_SYNC_CACHE_TO_GPU) {
+         tu_sync_cacheline_to_gpu(start);
+      } else {
+         tu_sync_cacheline_from_gpu(start);
+      }
+   }
+}
+
+static VkResult
+sync_cache(VkDevice _device,
+           enum tu_mem_sync_op op,
+           uint32_t count,
+           const VkMappedMemoryRange *ranges)
+{
+   TU_FROM_HANDLE(tu_device, device, _device);
+
+   if (!device->physical_device->has_cached_non_coherent_memory) {
+      tu_finishme(
+         "data cache clean and invalidation are unsupported on this arch!");
+      return VK_SUCCESS;
+   }
+
+   for (uint32_t i = 0; i < count; i++) {
+      TU_FROM_HANDLE(tu_device_memory, mem, ranges[i].memory);
+      sync_cache_bo(device, mem->bo, ranges[i].offset, ranges[i].size, op);
+   }
+
+   return VK_SUCCESS;
+}
+
 VkResult
 tu_FlushMappedMemoryRanges(VkDevice _device,
                            uint32_t memoryRangeCount,
                            const VkMappedMemoryRange *pMemoryRanges)
 {
-   return VK_SUCCESS;
+   return sync_cache(_device, TU_MEM_SYNC_CACHE_TO_GPU, memoryRangeCount,
+                     pMemoryRanges);
 }
 
 VkResult
@@ -595,7 +704,8 @@ tu_InvalidateMappedMemoryRanges(VkDevice _device,
                                 uint32_t memoryRangeCount,
                                 const VkMappedMemoryRange *pMemoryRanges)
 {
-   return VK_SUCCESS;
+   return sync_cache(_device, TU_MEM_SYNC_CACHE_FROM_GPU, memoryRangeCount,
+                     pMemoryRanges);
 }
 
 extern const struct vk_sync_type tu_timeline_sync_type;
@@ -1295,7 +1405,14 @@ tu_knl_drm_msm_load(struct tu_instance *instance,
 
    device->has_cached_coherent_memory = (device->msm_minor_version >= 8) &&
      tu_drm_is_memory_type_supported(fd, MSM_BO_CACHED_COHERENT);
-   device->has_cached_non_coherent_memory = false;
+#ifdef _SC_LEVEL1_DCACHE_LINESIZE
+   if (DETECT_ARCH_AARCH64 || DETECT_ARCH_X86 || DETECT_ARCH_X86_64) {
+      long l1_dcache = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+      device->has_cached_non_coherent_memory = l1_dcache > 0;
+      device->level1_dcache_size = l1_dcache;
+   }
+#endif
+
    ret = tu_drm_get_param(device, MSM_PARAM_FAULTS, &device->fault_count);
    if (ret != 0) {
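
For context, here is a minimal sketch of the application-side pattern these entry points now back: CPU writes through a HOST_VISIBLE | HOST_CACHED (non-coherent) mapping must be flushed before the GPU reads them, and GPU writes must be invalidated before the CPU reads them. Only the Vulkan calls themselves are real API; the helper name upload_and_readback and the assumption that 'memory' comes from such a non-coherent cached memory type are hypothetical.

#include <string.h>
#include <vulkan/vulkan.h>

/* Hypothetical helper: 'memory' is assumed to be allocated from a memory
 * type that is HOST_VISIBLE | HOST_CACHED but not HOST_COHERENT, i.e. the
 * MSM_BO_CACHED path added in this patch.
 */
static VkResult
upload_and_readback(VkDevice device, VkDeviceMemory memory,
                    VkDeviceSize size, const void *src, void *dst)
{
   void *map;
   VkResult result = vkMapMemory(device, memory, 0, VK_WHOLE_SIZE, 0, &map);
   if (result != VK_SUCCESS)
      return result;

   const VkMappedMemoryRange range = {
      .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
      .memory = memory,
      .offset = 0,
      .size = VK_WHOLE_SIZE, /* sidesteps nonCoherentAtomSize alignment rules */
   };

   /* CPU write -> GPU read: clean dirty cache lines to memory.
    * This now reaches sync_cache(..., TU_MEM_SYNC_CACHE_TO_GPU, ...). */
   memcpy(map, src, size);
   result = vkFlushMappedMemoryRanges(device, 1, &range);
   if (result != VK_SUCCESS)
      return result;

   /* ... submit GPU work that writes this memory and wait for it ... */

   /* GPU write -> CPU read: drop stale cache lines before reading.
    * This now reaches sync_cache(..., TU_MEM_SYNC_CACHE_FROM_GPU, ...). */
   result = vkInvalidateMappedMemoryRanges(device, 1, &range);
   if (result != VK_SUCCESS)
      return result;
   memcpy(dst, map, size);

   vkUnmapMemory(device, memory);
   return VK_SUCCESS;
}

Using VK_WHOLE_SIZE keeps the sketch free of nonCoherentAtomSize bookkeeping; with explicit sizes the range offset and size would have to respect that limit. On x86 both directions map to clflush above, while on aarch64 the flush and invalidate paths use DC CVAC and DC CIVAC respectively.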