diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index ceba9bca193..50694e4374e 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -1327,6 +1327,7 @@ init_device_meta(struct v3dv_device *device)
    mtx_init(&device->meta.mtx, mtx_plain);
    v3dv_meta_clear_init(device);
    v3dv_meta_blit_init(device);
+   v3dv_meta_texel_buffer_copy_init(device);
 }
 
 static void
@@ -1335,6 +1336,7 @@ destroy_device_meta(struct v3dv_device *device)
    mtx_destroy(&device->meta.mtx);
    v3dv_meta_clear_finish(device);
    v3dv_meta_blit_finish(device);
+   v3dv_meta_texel_buffer_copy_finish(device);
 }
 
 VkResult
diff --git a/src/broadcom/vulkan/v3dv_formats.c b/src/broadcom/vulkan/v3dv_formats.c
index ac8b8e58f54..e316ab142bd 100644
--- a/src/broadcom/vulkan/v3dv_formats.c
+++ b/src/broadcom/vulkan/v3dv_formats.c
@@ -505,6 +505,16 @@ buffer_format_features(VkFormat vk_format, const struct v3dv_format *v3dv_format
    return flags;
 }
 
+bool
+v3dv_buffer_format_supports_features(VkFormat vk_format,
+                                     VkFormatFeatureFlags features)
+{
+   const struct v3dv_format *v3dv_format = v3dv_get_format(vk_format);
+   const VkFormatFeatureFlags supported =
+      buffer_format_features(vk_format, v3dv_format);
+   return (supported & features) == features;
+}
+
 void
 v3dv_GetPhysicalDeviceFormatProperties(VkPhysicalDevice physicalDevice,
                                        VkFormat format,
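v3dv_buffer_format_supports_features() centralizes the check the texel-buffer copy path performs before it retrofits VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT onto a buffer (see texel_buffer_shader_copy below). A minimal sketch of the intended call pattern; the helper name try_enable_texel_buffer_usage and its shape are illustrative, not part of the patch:

/* Sketch: only tag the buffer for texel-buffer use if the format
 * actually supports it; otherwise the caller must fall back.
 */
static bool
try_enable_texel_buffer_usage(struct v3dv_buffer *buf, VkFormat fmt)
{
   if (buf->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)
      return true;
   if (!v3dv_buffer_format_supports_features(
          fmt, VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT))
      return false;
   buf->usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
   return true;
}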
diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c
index 98429081846..da91af994e1 100644
--- a/src/broadcom/vulkan/v3dv_meta_copy.c
+++ b/src/broadcom/vulkan/v3dv_meta_copy.c
@@ -78,6 +78,56 @@ v3dv_meta_blit_finish(struct v3dv_device *device)
    }
 }
 
+static uint32_t
+meta_texel_buffer_copy_key_hash(const void *key)
+{
+   return _mesa_hash_data(key, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
+}
+
+static bool
+meta_texel_buffer_copy_key_compare(const void *key1, const void *key2)
+{
+   return memcmp(key1, key2, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE) == 0;
+}
+
+void
+v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device)
+{
+   for (uint32_t i = 0; i < 3; i++) {
+      device->meta.texel_buffer_copy.cache[i] =
+         _mesa_hash_table_create(NULL,
+                                 meta_texel_buffer_copy_key_hash,
+                                 meta_texel_buffer_copy_key_compare);
+   }
+}
+
+void
+v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device)
+{
+   VkDevice _device = v3dv_device_to_handle(device);
+
+   for (uint32_t i = 0; i < 3; i++) {
+      hash_table_foreach(device->meta.texel_buffer_copy.cache[i], entry) {
+         struct v3dv_meta_texel_buffer_copy_pipeline *item = entry->data;
+         v3dv_DestroyPipeline(_device, item->pipeline, &device->alloc);
+         v3dv_DestroyRenderPass(_device, item->pass, &device->alloc);
+         v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->alloc);
+         vk_free(&device->alloc, item);
+      }
+      _mesa_hash_table_destroy(device->meta.texel_buffer_copy.cache[i], NULL);
+   }
+
+   if (device->meta.texel_buffer_copy.playout) {
+      v3dv_DestroyPipelineLayout(_device, device->meta.texel_buffer_copy.playout,
+                                 &device->alloc);
+   }
+
+   if (device->meta.texel_buffer_copy.dslayout) {
+      v3dv_DestroyDescriptorSetLayout(_device, device->meta.texel_buffer_copy.dslayout,
+                                      &device->alloc);
+   }
+}
+
 static inline bool
 can_use_tlb(struct v3dv_image *image,
             const VkOffset3D *offset,
@@ -2521,118 +2571,718 @@ create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
       return true;
 
    return false;
 }
 
-/**
- * Returns true if the implementation supports the requested operation (even if
- * it failed to process it, for example, due to an out-of-memory error).
- */
+
 static bool
-copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
-                          struct v3dv_image *image,
-                          struct v3dv_buffer *buffer,
-                          const VkBufferImageCopy *region)
+create_texel_buffer_copy_pipeline_layout(struct v3dv_device *device,
+                                         VkDescriptorSetLayout *ds_layout,
+                                         VkPipelineLayout *p_layout)
 {
-   bool handled = false;
+   VkResult result;
 
-   /* Generally, the bpp of the data in the buffer matches that of the
-    * destination image. The exception is the case where we are uploading
-    * stencil (8bpp) to a combined d24s8 image (32bpp).
-    */
-   uint32_t buffer_bpp = image->cpp;
-
-   VkImageAspectFlags aspect = region->imageSubresource.aspectMask;
-
-   /* We are about to upload the buffer data to an image so we can then
-    * blit that to our destination region. Because we are going to implement
-    * the copy as a blit, we want our blit source and destination formats to be
-    * the same (to avoid any format conversions), so we choose a canonical
-    * format that matches the destination image bpp.
-    */
-   VkColorComponentFlags cmask = 0; /* Write all components */
-   VkFormat src_format;
-   VkFormat dst_format;
-   switch (buffer_bpp) {
-   case 16:
-      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
-      src_format = VK_FORMAT_R32G32B32A32_UINT;
-      dst_format = src_format;
-      break;
-   case 8:
-      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
-      src_format = VK_FORMAT_R16G16B16A16_UINT;
-      dst_format = src_format;
-      break;
-   case 4:
-      switch (aspect) {
-      case VK_IMAGE_ASPECT_COLOR_BIT:
-         src_format = VK_FORMAT_R8G8B8A8_UINT;
-         dst_format = src_format;
-         break;
-      case VK_IMAGE_ASPECT_DEPTH_BIT:
-         assert(image->vk_format == VK_FORMAT_D32_SFLOAT ||
-                image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
-                image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32);
-         if (image->tiling != VK_IMAGE_TILING_LINEAR) {
-            src_format = image->vk_format;
-         } else {
-            src_format = VK_FORMAT_R8G8B8A8_UINT;
-            aspect = VK_IMAGE_ASPECT_COLOR_BIT;
-            if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
-               cmask = VK_COLOR_COMPONENT_R_BIT |
-                       VK_COLOR_COMPONENT_G_BIT |
-                       VK_COLOR_COMPONENT_B_BIT;
-            }
-         }
-         dst_format = src_format;
-         break;
-      case VK_IMAGE_ASPECT_STENCIL_BIT:
-         /* Since we don't support separate stencil this is always a stencil
-          * copy to a combined depth/stencil image. Becasue we don't support
-          * separate stencil images, we upload the buffer data to a compatible
-          * color R8UI image, and implement the blit as a compatible color
-          * blit to an RGBA8UI destination masking out writes to components
-          * GBA (which map to the D24 component of a S8D24 image).
-          */
-         assert(image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT);
-         buffer_bpp = 1;
-         src_format = VK_FORMAT_R8_UINT;
-         dst_format = VK_FORMAT_R8G8B8A8_UINT;
-         cmask = VK_COLOR_COMPONENT_R_BIT;
-         aspect = VK_IMAGE_ASPECT_COLOR_BIT;
-         break;
-      default:
-         unreachable("unsupported aspect");
-         return handled;
+   if (*ds_layout == 0) {
+      VkDescriptorSetLayoutBinding ds_layout_binding = {
+         .binding = 0,
+         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+         .descriptorCount = 1,
+         .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
       };
-      break;
-   case 2:
-      aspect = VK_IMAGE_ASPECT_COLOR_BIT;
-      src_format = VK_FORMAT_R16_UINT;
-      dst_format = src_format;
-      break;
-   case 1:
-      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
-      src_format = VK_FORMAT_R8_UINT;
-      dst_format = src_format;
-      break;
-   default:
-      unreachable("unsupported bit-size");
-      return handled;
+      VkDescriptorSetLayoutCreateInfo ds_layout_info = {
+         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+         .bindingCount = 1,
+         .pBindings = &ds_layout_binding,
+      };
+      result =
+         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
+                                        &ds_layout_info,
+                                        &device->alloc,
+                                        ds_layout);
+      if (result != VK_SUCCESS)
+         return false;
    }
 
-   /* We should be able to handle the blit if we reached here */
+   assert(*p_layout == 0);
+   VkPipelineLayoutCreateInfo p_layout_info = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+      .setLayoutCount = 1,
+      .pSetLayouts = ds_layout,
+      .pushConstantRangeCount = 1,
+      .pPushConstantRanges =
+         &(VkPushConstantRange) { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 24 },
+   };
+
+   result =
+      v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
+                                &p_layout_info,
+                                &device->alloc,
+                                p_layout);
+   return result == VK_SUCCESS;
+}
+
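The 24-byte push constant range matches what the copy fragment shader reads: a four-int destination box at offset 0 (range 16), the buffer row stride in texels at offset 16, and the buffer start offset in texels at offset 20. Viewed as a C struct (illustrative only; the driver packs these values by hand in texel_buffer_shader_copy):

/* Illustrative layout of the fragment shader push constants. */
struct texel_buffer_copy_push_constants {
   int32_t  box[4];   /* offset  0: x0, y0, x1, y1, inclusive bounds */
   uint32_t stride;   /* offset 16: buffer row stride, in texels */
   uint32_t offset;   /* offset 20: buffer start offset, in texels */
};                    /* 24 bytes total, the size of the range above */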
+ */ + uint32_t descriptor_count = 64; + if (cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE) { + struct v3dv_descriptor_pool *exhausted_pool = + v3dv_descriptor_pool_from_handle(cmd_buffer->meta.texel_buffer_copy.dspool); + descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024); + } + + /* Create the descriptor pool */ + cmd_buffer->meta.texel_buffer_copy.dspool = VK_NULL_HANDLE; + VkDescriptorPoolSize pool_size = { + .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = descriptor_count, + }; + VkDescriptorPoolCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .maxSets = descriptor_count, + .poolSizeCount = 1, + .pPoolSizes = &pool_size, + .flags = 0, + }; + VkResult result = + v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device), + &info, + &cmd_buffer->device->alloc, + &cmd_buffer->meta.texel_buffer_copy.dspool); + + if (result == VK_SUCCESS) { + assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE); + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t)cmd_buffer->meta.texel_buffer_copy.dspool, + (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool); + } + + return result; +} + +static VkResult +allocate_texel_buffer_copy_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer, + VkDescriptorSet *set) +{ + /* Make sure we have a descriptor pool */ + VkResult result; + if (cmd_buffer->meta.texel_buffer_copy.dspool == VK_NULL_HANDLE) { + result = create_texel_buffer_copy_descriptor_pool(cmd_buffer); + if (result != VK_SUCCESS) + return result; + } + assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE); + + /* Allocate descriptor set */ + struct v3dv_device *device = cmd_buffer->device; + VkDevice _device = v3dv_device_to_handle(device); + VkDescriptorSetAllocateInfo info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool, + .descriptorSetCount = 1, + .pSetLayouts = &device->meta.texel_buffer_copy.dslayout, + }; + result = v3dv_AllocateDescriptorSets(_device, &info, set); + + /* If we ran out of pool space, grow the pool and try again */ + if (result == VK_ERROR_OUT_OF_POOL_MEMORY) { + result = create_texel_buffer_copy_descriptor_pool(cmd_buffer); + if (result == VK_SUCCESS) { + info.descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool; + result = v3dv_AllocateDescriptorSets(_device, &info, set); + } + } + + return result; +} + +static void +get_texel_buffer_copy_pipeline_cache_key(VkFormat format, + uint8_t *key) +{ + memset(key, 0, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE); + + uint32_t *p = (uint32_t *) key; + + *p = format; + p++; + + assert(((uint8_t*)p - key) == V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE); +} + +static bool +create_blit_render_pass(struct v3dv_device *device, + VkFormat dst_format, + VkFormat src_format, + VkRenderPass *pass_load, + VkRenderPass *pass_no_load); + +static nir_ssa_def *gen_rect_vertices(nir_builder *b); + +static bool +create_pipeline(struct v3dv_device *device, + struct v3dv_render_pass *pass, + struct nir_shader *vs_nir, + struct nir_shader *fs_nir, + const VkPipelineVertexInputStateCreateInfo *vi_state, + const VkPipelineDepthStencilStateCreateInfo *ds_state, + const VkPipelineColorBlendStateCreateInfo *cb_state, + const VkPipelineMultisampleStateCreateInfo *ms_state, + const VkPipelineLayout layout, + VkPipeline *pipeline); + +static nir_shader * +get_texel_buffer_copy_vs() +{ + const nir_shader_compiler_options *options = 
+static void
+get_texel_buffer_copy_pipeline_cache_key(VkFormat format,
+                                         uint8_t *key)
+{
+   memset(key, 0, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
+
+   uint32_t *p = (uint32_t *) key;
+
+   *p = format;
+   p++;
+
+   assert(((uint8_t*)p - key) == V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
+}
+
+static bool
+create_blit_render_pass(struct v3dv_device *device,
+                        VkFormat dst_format,
+                        VkFormat src_format,
+                        VkRenderPass *pass_load,
+                        VkRenderPass *pass_no_load);
+
+static nir_ssa_def *gen_rect_vertices(nir_builder *b);
+
+static bool
+create_pipeline(struct v3dv_device *device,
+                struct v3dv_render_pass *pass,
+                struct nir_shader *vs_nir,
+                struct nir_shader *fs_nir,
+                const VkPipelineVertexInputStateCreateInfo *vi_state,
+                const VkPipelineDepthStencilStateCreateInfo *ds_state,
+                const VkPipelineColorBlendStateCreateInfo *cb_state,
+                const VkPipelineMultisampleStateCreateInfo *ms_state,
+                const VkPipelineLayout layout,
+                VkPipeline *pipeline);
+
+static nir_shader *
+get_texel_buffer_copy_vs()
+{
+   const nir_shader_compiler_options *options =
+      v3dv_pipeline_get_nir_options();
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
+                                                  "meta texel buffer copy vs");
+   nir_variable *vs_out_pos =
+      nir_variable_create(b.shader, nir_var_shader_out,
+                          glsl_vec4_type(), "gl_Position");
+   vs_out_pos->data.location = VARYING_SLOT_POS;
+
+   nir_ssa_def *pos = gen_rect_vertices(&b);
+   nir_store_var(&b, vs_out_pos, pos, 0xf);
+
+   return b.shader;
+}
+
+static nir_ssa_def *
+load_frag_coord(nir_builder *b)
+{
+   nir_foreach_shader_in_variable(var, b->shader) {
+      if (var->data.location == VARYING_SLOT_POS)
+         return nir_load_var(b, var);
+   }
+   nir_variable *pos = nir_variable_create(b->shader, nir_var_shader_in,
+                                           glsl_vec4_type(), NULL);
+   pos->data.location = VARYING_SLOT_POS;
+   return nir_load_var(b, pos);
+}
+
+static nir_shader *
+get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format)
+{
+   const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
+                                                  "meta texel buffer copy fs");
+
+   /* We only use the copy from texel buffer shader to implement
+    * copy_buffer_to_image_shader, which always selects a compatible integer
+    * format for the copy.
+    */
+   assert(vk_format_is_int(format));
+
+   /* Fragment shader output color */
+   nir_variable *fs_out_color =
+      nir_variable_create(b.shader, nir_var_shader_out,
+                          glsl_uvec4_type(), "out_color");
+   fs_out_color->data.location = FRAG_RESULT_DATA0;
+
+   /* Texel buffer input */
+   const struct glsl_type *sampler_type =
+      glsl_sampler_type(GLSL_SAMPLER_DIM_BUF, false, false, GLSL_TYPE_UINT);
+   nir_variable *sampler =
+      nir_variable_create(b.shader, nir_var_uniform, sampler_type, "texel_buf");
+   sampler->data.descriptor_set = 0;
+   sampler->data.binding = 0;
+
+   /* Load the box describing the pixel region we want to copy from the
+    * texel buffer.
+    */
+   nir_intrinsic_instr *box =
+      nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+   box->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+   nir_intrinsic_set_base(box, 0);
+   nir_intrinsic_set_range(box, 16);
+   box->num_components = 4;
+   nir_ssa_dest_init(&box->instr, &box->dest, 4, 32, "box");
+   nir_builder_instr_insert(&b, &box->instr);
+
+   /* Load the buffer stride (this comes in texel units) */
+   nir_intrinsic_instr *stride =
+      nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+   stride->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+   nir_intrinsic_set_base(stride, 16);
+   nir_intrinsic_set_range(stride, 4);
+   stride->num_components = 1;
+   nir_ssa_dest_init(&stride->instr, &stride->dest, 1, 32, "buffer stride");
+   nir_builder_instr_insert(&b, &stride->instr);
+
+   /* Load the buffer offset (this comes in texel units) */
+   nir_intrinsic_instr *offset =
+      nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_push_constant);
+   offset->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
+   nir_intrinsic_set_base(offset, 20);
+   nir_intrinsic_set_range(offset, 4);
+   offset->num_components = 1;
+   nir_ssa_dest_init(&offset->instr, &offset->dest, 1, 32, "buffer offset");
+   nir_builder_instr_insert(&b, &offset->instr);
+
+   /* Pixel coordinate must be within the box, otherwise our buffer offsets
+    * could be out of bounds.
+ */ + nir_ssa_def *coord = nir_f2i32(&b, load_frag_coord(&b)); + nir_ssa_def *cond = + nir_iand(&b, + nir_iand(&b, nir_ige(&b, nir_channel(&b, coord, 0), + nir_channel(&b, &box->dest.ssa, 0)), + nir_ige(&b, nir_channel(&b, coord, 1), + nir_channel(&b, &box->dest.ssa, 1))), + nir_iand(&b, nir_ige(&b, nir_channel(&b, &box->dest.ssa, 2), + nir_channel(&b, coord, 0)), + nir_ige(&b, nir_channel(&b, &box->dest.ssa, 3), + nir_channel(&b, coord, 1)))); + + nir_if *if_stmt = nir_push_if(&b, cond); + /* Load pixel data from texel buffer based on the x,y offset of the pixel + * within the box. Texel buffers are 1D arrays of texels. + */ + nir_ssa_def *x_offset = + nir_isub(&b, nir_channel(&b, coord, 0), + nir_channel(&b, &box->dest.ssa, 0)); + nir_ssa_def *y_offset = + nir_isub(&b, nir_channel(&b, coord, 1), + nir_channel(&b, &box->dest.ssa, 1)); + nir_ssa_def *texel_offset = + nir_iadd(&b, nir_iadd(&b, &offset->dest.ssa, x_offset), + nir_imul(&b, y_offset, &stride->dest.ssa)); + + nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa; + nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2); + tex->sampler_dim = GLSL_SAMPLER_DIM_BUF; + tex->op = nir_texop_txf; + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(texel_offset); + tex->src[1].src_type = nir_tex_src_texture_deref; + tex->src[1].src = nir_src_for_ssa(tex_deref); + tex->dest_type = nir_type_uint; + tex->is_array = false; + tex->coord_components = 1; + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "texel buffer result"); + nir_builder_instr_insert(&b, &tex->instr); + + nir_store_var(&b, fs_out_color, &tex->dest.ssa, 0xf); + nir_pop_if(&b, if_stmt); + + return b.shader; +} + +static bool +create_texel_buffer_copy_pipeline(struct v3dv_device *device, + VkFormat format, + VkRenderPass _pass, + VkPipelineLayout pipeline_layout, + VkPipeline *pipeline) +{ + struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass); + + assert(vk_format_is_color(format)); + + nir_shader *vs_nir = get_texel_buffer_copy_vs(); + nir_shader *fs_nir = get_texel_buffer_copy_fs(device, format); + + const VkPipelineVertexInputStateCreateInfo vi_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 0, + .vertexAttributeDescriptionCount = 0, + }; + + VkPipelineDepthStencilStateCreateInfo ds_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + }; + + VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 }; + blend_att_state[0] = (VkPipelineColorBlendAttachmentState) { + .blendEnable = false, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | + VK_COLOR_COMPONENT_A_BIT, + }; + + const VkPipelineColorBlendStateCreateInfo cb_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .logicOpEnable = false, + .attachmentCount = 1, + .pAttachments = blend_att_state + }; + + const VkPipelineMultisampleStateCreateInfo ms_state = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + .sampleShadingEnable = false, + .pSampleMask = NULL, + .alphaToCoverageEnable = false, + .alphaToOneEnable = false, + }; + + return create_pipeline(device, + pass, + vs_nir, fs_nir, + &vi_state, + &ds_state, + &cb_state, + &ms_state, + pipeline_layout, + pipeline); +} + +static bool +get_copy_texel_buffer_pipeline( + struct v3dv_device *device, + VkFormat format, + VkImageType image_type, + struct 
+static bool
+get_copy_texel_buffer_pipeline(
+   struct v3dv_device *device,
+   VkFormat format,
+   VkImageType image_type,
+   struct v3dv_meta_texel_buffer_copy_pipeline **pipeline)
+{
+   bool ok = true;
+
+   mtx_lock(&device->meta.mtx);
+   if (!device->meta.texel_buffer_copy.playout) {
+      ok = create_texel_buffer_copy_pipeline_layout(
+         device,
+         &device->meta.texel_buffer_copy.dslayout,
+         &device->meta.texel_buffer_copy.playout);
+   }
+   mtx_unlock(&device->meta.mtx);
+   if (!ok)
+      return false;
+
+   uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
+   get_texel_buffer_copy_pipeline_cache_key(format, key);
+   mtx_lock(&device->meta.mtx);
+   struct hash_entry *entry =
+      _mesa_hash_table_search(device->meta.texel_buffer_copy.cache[image_type],
+                              &key);
+   if (entry) {
+      mtx_unlock(&device->meta.mtx);
+      *pipeline = entry->data;
+      return true;
+   }
+
+   *pipeline = vk_zalloc2(&device->alloc, NULL, sizeof(**pipeline), 8,
+                          VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+   if (*pipeline == NULL)
+      goto fail;
+
+   /* The blit render pass is compatible */
+   ok = create_blit_render_pass(device, format, format,
+                                &(*pipeline)->pass,
+                                &(*pipeline)->pass_no_load);
+   if (!ok)
+      goto fail;
+
+   ok =
+      create_texel_buffer_copy_pipeline(device,
+                                        format,
+                                        (*pipeline)->pass,
+                                        device->meta.texel_buffer_copy.playout,
+                                        &(*pipeline)->pipeline);
+   if (!ok)
+      goto fail;
+
+   memcpy((*pipeline)->key, key, sizeof((*pipeline)->key));
+   _mesa_hash_table_insert(device->meta.texel_buffer_copy.cache[image_type],
+                           &(*pipeline)->key, *pipeline);
+
+   mtx_unlock(&device->meta.mtx);
+   return true;
+
+fail:
+   mtx_unlock(&device->meta.mtx);
+
+   VkDevice _device = v3dv_device_to_handle(device);
+   if (*pipeline) {
+      if ((*pipeline)->pass)
+         v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->alloc);
+      if ((*pipeline)->pipeline)
+         v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->alloc);
+      vk_free(&device->alloc, *pipeline);
+      *pipeline = NULL;
+   }
+
+   return false;
+}
+
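Note the key-ownership pattern in the cache insert: mesa hash tables store key pointers, not copies, so the key lives inside the cached pipeline struct and therefore has the same lifetime as its entry. As a generic sketch (types and KEY_SIZE are hypothetical):

/* Sketch of the key-embedding pattern: the table references obj->key,
 * which stays valid for as long as the entry itself.
 */
struct cached_obj {
   uint8_t key[KEY_SIZE];   /* owned copy of the lookup key */
   /* ... cached payload ... */
};

static void
cache_insert(struct hash_table *ht, const uint8_t *key, struct cached_obj *obj)
{
   memcpy(obj->key, key, sizeof(obj->key));
   _mesa_hash_table_insert(ht, obj->key, obj);
}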
+ */ handled = true; - /* Obtain the 2D buffer region spec */ - uint32_t buf_width, buf_height; - if (region->bufferRowLength == 0) - buf_width = region->imageExtent.width; - else - buf_width = region->bufferRowLength; + /* Get the texel buffer copy pipeline */ + struct v3dv_meta_texel_buffer_copy_pipeline *pipeline = NULL; + bool ok = get_copy_texel_buffer_pipeline(cmd_buffer->device, + format, image->type, &pipeline); + if (!ok) + return handled; + assert(pipeline && pipeline->pipeline && pipeline->pass); - if (region->bufferImageHeight == 0) - buf_height = region->imageExtent.height; - else - buf_height = region->bufferImageHeight; + /* Setup descriptor set for the source texel buffer. We don't have to + * register the descriptor as a private command buffer object since + * all descriptors will be freed automatically with the descriptor + * pool. + */ + VkDescriptorSet set; + result = allocate_texel_buffer_copy_descriptor_set(cmd_buffer, &set); + if (result != VK_SUCCESS) + return handled; + + /* FIXME: for some reason passing region->bufferOffset here for the + * offset field doesn't work, making the following CTS tests fail: + * + * dEQP-VK.api.copy_and_blit.core.buffer_to_image.*buffer_offset* + * + * So instead we pass 0 here and we pass the offset in texels as a push + * constant to the shader, which seems to work correctly. + */ + VkDevice _device = v3dv_device_to_handle(cmd_buffer->device); + VkBufferViewCreateInfo buffer_view_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .buffer = v3dv_buffer_to_handle(buffer), + .format = format, + .offset = 0, + .range = VK_WHOLE_SIZE, + }; + + VkBufferView texel_buffer_view; + result = v3dv_CreateBufferView(_device, &buffer_view_info, + &cmd_buffer->device->alloc, + &texel_buffer_view); + if (result != VK_SUCCESS) + return handled; + + v3dv_cmd_buffer_add_private_obj( + cmd_buffer, (uintptr_t)texel_buffer_view, + (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyBufferView); + + VkWriteDescriptorSet write = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = set, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .pTexelBufferView = &texel_buffer_view, + }; + v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL); + + /* Push command buffer state before starting meta operation */ + v3dv_cmd_buffer_meta_state_push(cmd_buffer, true); + + /* Bind common state for all layers */ + VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer); + v3dv_CmdBindPipeline(_cmd_buffer, + VK_PIPELINE_BIND_POINT_GRAPHICS, + pipeline->pipeline); + + v3dv_CmdBindDescriptorSets(_cmd_buffer, + VK_PIPELINE_BIND_POINT_GRAPHICS, + cmd_buffer->device->meta.texel_buffer_copy.playout, + 0, 1, &set, + 0, NULL); + + const VkViewport viewport = { + .x = region->imageOffset.x, + .y = region->imageOffset.y, + .width = region->imageExtent.width, + .height = region->imageExtent.height, + .minDepth = 0.0f, + .maxDepth = 1.0f + }; + v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport); + const VkRect2D scissor = { + .offset = { region->imageOffset.x, region->imageOffset.y }, + .extent = { region->imageExtent.width, region->imageExtent.height } + }; + v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor); + + uint32_t dirty_dynamic_state = + V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR; + + bool can_skip_tlb_load = false; + const VkRect2D render_area = { + .offset = { region->imageOffset.x, region->imageOffset.y }, + .extent = { region->imageExtent.width, 
+                  region->imageExtent.height },
+   };
+
+   /* Record per-layer commands */
+   for (uint32_t i = 0; i < num_layers; i++) {
+      /* Setup framebuffer for this layer.
+       *
+       * FIXME: once we support geometry shaders, we should be able to have
+       * a single layered framebuffer and emit just one draw call for
+       * all layers using layered rendering.
+       */
+      VkImageViewCreateInfo image_view_info = {
+         .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+         .image = v3dv_image_to_handle(image),
+         .viewType = v3dv_image_type_to_view_type(image->type),
+         .format = format,
+         .subresourceRange = {
+            .aspectMask = aspect,
+            .baseMipLevel = region->imageSubresource.mipLevel,
+            .levelCount = 1,
+            .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
+            .layerCount = 1
+         },
+      };
+      VkImageView image_view;
+      result = v3dv_CreateImageView(_device, &image_view_info,
+                                    &cmd_buffer->device->alloc, &image_view);
+      if (result != VK_SUCCESS)
+         goto fail;
+
+      VkFramebufferCreateInfo fb_info = {
+         .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
+         .renderPass = pipeline->pass,
+         .attachmentCount = 1,
+         .pAttachments = &image_view,
+         .width = u_minify(image->extent.width,
+                           region->imageSubresource.mipLevel),
+         .height = u_minify(image->extent.height,
+                            region->imageSubresource.mipLevel),
+         .layers = 1,
+      };
+
+      VkFramebuffer fb;
+      result = v3dv_CreateFramebuffer(_device, &fb_info,
+                                      &cmd_buffer->device->alloc, &fb);
+      if (result != VK_SUCCESS)
+         goto fail;
+
+      v3dv_cmd_buffer_add_private_obj(
+         cmd_buffer, (uintptr_t)fb,
+         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
+
+      /* If the region we are about to blit is tile-aligned, then we can
+       * use the render pass version that won't pre-load the tile buffer
+       * with the dst image contents before the copy.
+       *
+       * The region is always the same for all layers, so we only need to
+       * compute this once.
+       */
+      if (i == 0) {
+         struct v3dv_render_pass *pipeline_pass =
+            v3dv_render_pass_from_handle(pipeline->pass);
+         can_skip_tlb_load =
+            v3dv_subpass_area_is_tile_aligned(&render_area,
+                                              v3dv_framebuffer_from_handle(fb),
+                                              pipeline_pass, 0);
+      }
+
+      VkRenderPassBeginInfo rp_info = {
+         .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+         .renderPass = can_skip_tlb_load ?
+            pipeline->pass_no_load :
+            pipeline->pass,
+         .framebuffer = fb,
+         .renderArea = render_area,
+         .clearValueCount = 0,
+      };
+
+      /* Record draw */
+      v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE);
+      struct v3dv_job *job = cmd_buffer->state.job;
+      if (!job)
+         goto fail;
+
+      const VkDeviceSize buf_offset =
+         region->bufferOffset / buffer_bpp + i * buf_height * buf_width;
+      uint32_t push_data[6] = {
+         region->imageOffset.x,
+         region->imageOffset.y,
+         region->imageOffset.x + region->imageExtent.width - 1,
+         region->imageOffset.y + region->imageExtent.height - 1,
+         buf_width,
+         buf_offset,
+      };
+
+      v3dv_CmdPushConstants(_cmd_buffer,
+                            cmd_buffer->device->meta.texel_buffer_copy.playout,
+                            VK_SHADER_STAGE_FRAGMENT_BIT,
+                            0, sizeof(push_data), &push_data);
+
+      v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
+
+      v3dv_CmdEndRenderPass(_cmd_buffer);
+   }
+
+fail:
+   v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true);
+
+   return handled;
+}
+
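A worked example of the push data for layer 0: copying to imageOffset (8, 8) with imageExtent 16x4, bufferRowLength 32 and bufferOffset 0 on a 4 bpp format gives the values below, and pixel (10, 9) then fetches texel 0 + (10 - 8) + (9 - 8) * 32 = 34. All numbers are illustrative:

/* Push data for the example above, mirroring push_data[] in
 * texel_buffer_shader_copy().
 */
uint32_t example_push[6] = {
   8, 8,     /* box min: imageOffset.x, imageOffset.y */
   23, 11,   /* box max: 8 + 16 - 1, 8 + 4 - 1 (inclusive) */
   32,       /* row stride in texels (bufferRowLength) */
   0,        /* start offset in texels (bufferOffset / bpp) */
};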
+static bool
+copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
+                          VkImageAspectFlags aspect,
+                          struct v3dv_image *image,
+                          uint32_t num_layers,
+                          VkFormat dst_format,
+                          VkFormat src_format,
+                          struct v3dv_buffer *buffer,
+                          uint32_t buf_width,
+                          uint32_t buf_height,
+                          uint32_t buffer_bpp,
+                          VkColorComponentFlags cmask,
+                          const VkBufferImageCopy *region)
+{
+   perf_debug("Falling back to blit path for buffer to image copy.\n");
 
    /* If the image is compressed, the bpp refers to blocks, not pixels */
    uint32_t block_width = vk_format_get_blockwidth(image->vk_format);
@@ -2640,29 +3290,15 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
    buf_width = buf_width / block_width;
    buf_height = buf_height / block_height;
 
-   /* Compute layers to copy */
-   uint32_t num_layers;
-   if (image->type != VK_IMAGE_TYPE_3D)
-      num_layers = region->imageSubresource.layerCount;
-   else
-      num_layers = region->imageExtent.depth;
-   assert(num_layers > 0);
+   /* We should have configured the blit to use a supported format */
+   bool handled = true;
 
    struct v3dv_device *device = cmd_buffer->device;
    VkDevice _device = v3dv_device_to_handle(device);
 
    for (uint32_t i = 0; i < num_layers; i++) {
-      /* Create the source blit image from the source buffer.
-       *
-       * We can't texture from a linear image, so we can't just setup a blit
-       * straight from the buffer contents. Instead, we need to upload the
-       * buffer to a tiled image, and then copy that image to the selected
-       * region of the destination.
-       *
-       * FIXME: we could do better than this is we use a blit shader that has
-       * a UBO (for the buffer) as input instead of a texture. Then we would
-       * have to do some arithmetics in the shader to identify the offset into
-       * the UBO that we need to load for each pixel in the destination image
-       * (we would need to support all the possible copy formats we have above).
+      /* Otherwise, since we can't sample linear images we need to upload the
+       * linear buffer to a tiled image that we can use as a blit source, which
+       * is slow.
        */
       VkImageCreateInfo image_info = {
          .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
@@ -2712,10 +3348,10 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
         return handled;
 
      /* Upload buffer contents for the selected layer */
-      VkDeviceSize buffer_offset =
+      const VkDeviceSize buf_offset_bytes =
         region->bufferOffset + i * buf_height * buf_width * buffer_bpp;
      const VkBufferImageCopy buffer_image_copy = {
-         .bufferOffset = buffer_offset,
+         .bufferOffset = buf_offset_bytes,
         .bufferRowLength = region->bufferRowLength / block_width,
         .bufferImageHeight = region->bufferImageHeight / block_height,
         .imageSubresource = {
@@ -2793,8 +3429,139 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
       }
    }
 
-   assert(handled);
-   return true;
+   return handled;
+}
+
+/**
+ * Returns true if the implementation supports the requested operation (even if
+ * it failed to process it, for example, due to an out-of-memory error).
+ */
+static bool
+copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
+                            struct v3dv_image *image,
+                            struct v3dv_buffer *buffer,
+                            const VkBufferImageCopy *region,
+                            bool use_texel_buffer)
+{
+   /* Generally, the bpp of the data in the buffer matches that of the
+    * destination image. The exception is the case where we are uploading
+    * stencil (8bpp) to a combined d24s8 image (32bpp).
+    */
+   uint32_t buf_bpp = image->cpp;
+
+   VkImageAspectFlags aspect = region->imageSubresource.aspectMask;
+
+   /* We are about to upload the buffer data to an image so we can then
+    * blit that to our destination region. Because we are going to implement
+    * the copy as a blit, we want our blit source and destination formats to be
+    * the same (to avoid any format conversions), so we choose a canonical
+    * format that matches the destination image bpp.
+    */
+   VkColorComponentFlags cmask = 0; /* Write all components */
+   VkFormat src_format;
+   VkFormat dst_format;
+   switch (buf_bpp) {
+   case 16:
+      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
+      src_format = VK_FORMAT_R32G32B32A32_UINT;
+      dst_format = src_format;
+      break;
+   case 8:
+      assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
+      src_format = VK_FORMAT_R16G16B16A16_UINT;
+      dst_format = src_format;
+      break;
+   case 4:
+      switch (aspect) {
+      case VK_IMAGE_ASPECT_COLOR_BIT:
+         src_format = VK_FORMAT_R8G8B8A8_UINT;
+         dst_format = src_format;
+         break;
+      case VK_IMAGE_ASPECT_DEPTH_BIT:
+         assert(image->vk_format == VK_FORMAT_D32_SFLOAT ||
+                image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
+                image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32);
+         if (image->tiling != VK_IMAGE_TILING_LINEAR) {
+            src_format = image->vk_format;
+         } else {
+            src_format = VK_FORMAT_R8G8B8A8_UINT;
+            aspect = VK_IMAGE_ASPECT_COLOR_BIT;
+            if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
+               cmask = VK_COLOR_COMPONENT_R_BIT |
+                       VK_COLOR_COMPONENT_G_BIT |
+                       VK_COLOR_COMPONENT_B_BIT;
+            }
+         }
+         dst_format = src_format;
+         break;
+      case VK_IMAGE_ASPECT_STENCIL_BIT:
+         /* Since we don't support separate stencil this is always a stencil
+          * copy to a combined depth/stencil image. Because we don't support
+          * separate stencil images, we upload the buffer data to a compatible
+          * color R8UI image, and implement the blit as a compatible color
+          * blit to an RGBA8UI destination masking out writes to components
+          * GBA (which map to the D24 component of a S8D24 image).
+ */ + assert(image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT); + buf_bpp = 1; + src_format = VK_FORMAT_R8_UINT; + dst_format = VK_FORMAT_R8G8B8A8_UINT; + cmask = VK_COLOR_COMPONENT_R_BIT; + aspect = VK_IMAGE_ASPECT_COLOR_BIT; + break; + default: + unreachable("unsupported aspect"); + return false; + }; + break; + case 2: + aspect = VK_IMAGE_ASPECT_COLOR_BIT; + src_format = VK_FORMAT_R16_UINT; + dst_format = src_format; + break; + case 1: + assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT); + src_format = VK_FORMAT_R8_UINT; + dst_format = src_format; + break; + default: + unreachable("unsupported bit-size"); + return false; + } + + /* Obtain the 2D buffer region spec */ + uint32_t buf_width, buf_height; + if (region->bufferRowLength == 0) + buf_width = region->imageExtent.width; + else + buf_width = region->bufferRowLength; + + if (region->bufferImageHeight == 0) + buf_height = region->imageExtent.height; + else + buf_height = region->bufferImageHeight; + + /* Compute layers to copy */ + uint32_t num_layers; + if (image->type != VK_IMAGE_TYPE_3D) + num_layers = region->imageSubresource.layerCount; + else + num_layers = region->imageExtent.depth; + assert(num_layers > 0); + + if (use_texel_buffer) { + return texel_buffer_shader_copy(cmd_buffer, aspect, + image, num_layers, + dst_format, src_format, + buffer, buf_width, buf_height, buf_bpp, + cmask, region); + } else { + return copy_buffer_to_image_blit(cmd_buffer, aspect, + image, num_layers, + dst_format, src_format, + buffer, buf_width, buf_height, buf_bpp, + cmask, region); + } } /** @@ -2882,9 +3649,11 @@ v3dv_CmdCopyBufferToImage(VkCommandBuffer commandBuffer, continue; if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &pRegions[i])) continue; + if (copy_buffer_to_image_shader(cmd_buffer, image, buffer, &pRegions[i], true)) + continue; if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer, &pRegions[i])) continue; - if (copy_buffer_to_image_blit(cmd_buffer, image, buffer, &pRegions[i])) + if (copy_buffer_to_image_shader(cmd_buffer, image, buffer, &pRegions[i], false)) continue; unreachable("Unsupported buffer to image copy."); } diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h index 2c22b8da6d0..0b474e8b69e 100644 --- a/src/broadcom/vulkan/v3dv_private.h +++ b/src/broadcom/vulkan/v3dv_private.h @@ -174,6 +174,9 @@ void v3dv_meta_clear_finish(struct v3dv_device *device); void v3dv_meta_blit_init(struct v3dv_device *device); void v3dv_meta_blit_finish(struct v3dv_device *device); +void v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device); +void v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device); + struct v3dv_app_info { const char *app_name; uint32_t app_version; @@ -246,7 +249,8 @@ struct v3dv_queue { struct v3dv_job *noop_job; }; -#define V3DV_META_BLIT_CACHE_KEY_SIZE (4 * sizeof(uint32_t)) +#define V3DV_META_BLIT_CACHE_KEY_SIZE (4 * sizeof(uint32_t)) +#define V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE (1 * sizeof(uint32_t)) struct v3dv_meta_color_clear_pipeline { VkPipeline pipeline; @@ -267,6 +271,13 @@ struct v3dv_meta_blit_pipeline { uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE]; }; +struct v3dv_meta_texel_buffer_copy_pipeline { + VkPipeline pipeline; + VkRenderPass pass; + VkRenderPass pass_no_load; + uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE]; +}; + struct v3dv_pipeline_cache_stats { uint32_t miss; uint32_t hit; @@ -322,6 +333,11 @@ struct v3dv_device { VkPipelineLayout playout; struct hash_table *cache[3]; /* v3dv_meta_blit_pipeline for 1d, 2d, 3d */ } blit; + 
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 2c22b8da6d0..0b474e8b69e 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -174,6 +174,9 @@ void v3dv_meta_clear_finish(struct v3dv_device *device);
 void v3dv_meta_blit_init(struct v3dv_device *device);
 void v3dv_meta_blit_finish(struct v3dv_device *device);
 
+void v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device);
+void v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device);
+
 struct v3dv_app_info {
    const char *app_name;
    uint32_t app_version;
@@ -246,7 +249,8 @@ struct v3dv_queue {
    struct v3dv_job *noop_job;
 };
 
-#define V3DV_META_BLIT_CACHE_KEY_SIZE (4 * sizeof(uint32_t))
+#define V3DV_META_BLIT_CACHE_KEY_SIZE              (4 * sizeof(uint32_t))
+#define V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE (1 * sizeof(uint32_t))
 
 struct v3dv_meta_color_clear_pipeline {
    VkPipeline pipeline;
@@ -267,6 +271,13 @@ struct v3dv_meta_blit_pipeline {
    uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
 };
 
+struct v3dv_meta_texel_buffer_copy_pipeline {
+   VkPipeline pipeline;
+   VkRenderPass pass;
+   VkRenderPass pass_no_load;
+   uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
+};
+
 struct v3dv_pipeline_cache_stats {
    uint32_t miss;
    uint32_t hit;
@@ -322,6 +333,11 @@ struct v3dv_device {
          VkPipelineLayout playout;
         struct hash_table *cache[3]; /* v3dv_meta_blit_pipeline for 1d, 2d, 3d */
      } blit;
+
+      struct {
+         VkDescriptorSetLayout dslayout;
+         VkPipelineLayout playout;
+         struct hash_table *cache[3]; /* v3dv_meta_texel_buffer_copy_pipeline for 1d, 2d, 3d */
+      } texel_buffer_copy;
    } meta;
 
    struct v3dv_bo_cache {
@@ -1166,6 +1182,10 @@ struct v3dv_cmd_buffer {
          /* The current descriptor pool for blit sources */
         VkDescriptorPool dspool;
      } blit;
+      struct {
+         /* The current descriptor pool for texel buffer copy sources */
+         VkDescriptorPool dspool;
+      } texel_buffer_copy;
    } meta;
 
    /* List of jobs in the command buffer. For primary command buffers it
@@ -1784,6 +1804,8 @@ void v3dv_get_internal_type_bpp_for_output_format(uint32_t format, uint32_t *typ
 uint8_t v3dv_get_tex_return_size(const struct v3dv_format *vf, bool compare_enable);
 bool v3dv_tfu_supports_tex_format(const struct v3d_device_info *devinfo, uint32_t tex_format);
+bool v3dv_buffer_format_supports_features(VkFormat vk_format,
+                                          VkFormatFeatureFlags features);
 bool v3dv_format_supports_tlb_resolve(const struct v3dv_format *format);
 
 uint32_t v3d_utile_width(int cpp);