diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp
index c7c18078c20..fae8aa1d6e6 100644
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -1004,6 +1004,82 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
    }
 }
 
+static void
+emit_urb_indirect_vec4_write(const fs_builder &bld,
+                             const fs_reg &offset_src,
+                             unsigned base,
+                             const fs_reg &src,
+                             fs_reg urb_handle,
+                             unsigned src_comp_offset,
+                             unsigned dst_comp_offset,
+                             unsigned comps,
+                             unsigned mask)
+{
+   for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
+      fs_builder bld8 = bld.group(8, q);
+
+      fs_reg off = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      bld8.MOV(off, quarter(offset_src, q));
+      bld8.ADD(off, off, brw_imm_ud(base));
+      bld8.SHR(off, off, brw_imm_ud(2));
+
+      fs_reg payload_srcs[4];
+      unsigned length = 0;
+
+      for (unsigned i = 0; i < dst_comp_offset; i++)
+         payload_srcs[length++] = reg_undef;
+
+      for (unsigned c = 0; c < comps; c++)
+         payload_srcs[length++] = quarter(offset(src, bld, c + src_comp_offset), q);
+
+      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+      srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
+      srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
+      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
+      srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
+                                          BRW_REGISTER_TYPE_F);
+      bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
+
+      fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
+                                reg_undef, srcs, ARRAY_SIZE(srcs));
+      inst->mlen = 3 + length;
+      inst->offset = 0;
+   }
+}
+
+static void
+emit_urb_indirect_writes_mod(const fs_builder &bld, nir_intrinsic_instr *instr,
+                             const fs_reg &src, const fs_reg &offset_src,
+                             fs_reg urb_handle, unsigned mod)
+{
+   assert(nir_src_bit_size(instr->src[0]) == 32);
+
+   const unsigned comps = nir_src_num_components(instr->src[0]);
+   assert(comps <= 4);
+
+   const unsigned mask = nir_intrinsic_write_mask(instr);
+   const unsigned base_in_dwords = nir_intrinsic_base(instr) +
+                                   component_from_intrinsic(instr);
+
+   const unsigned comp_shift = mod;
+   const unsigned first_comps = MIN2(comps, 4 - comp_shift);
+   const unsigned second_comps = comps - first_comps;
+   const unsigned first_mask = (mask << comp_shift) & 0xF;
+   const unsigned second_mask = (mask >> (4 - comp_shift)) & 0xF;
+
+   if (first_mask > 0) {
+      emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords, src,
+                                   urb_handle, 0, comp_shift, first_comps,
+                                   first_mask);
+   }
+
+   if (second_mask > 0) {
+      emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords + 4, src,
+                                   urb_handle, first_comps, 0, second_comps,
+                                   second_mask);
+   }
+}
+
 static void
 emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
                          const fs_reg &src, const fs_reg &offset_src,
@@ -1184,15 +1260,30 @@ fs_visitor::emit_task_mesh_store(const fs_builder &bld, nir_intrinsic_instr *ins
    ubld8.MOV(h, urb_handle);
    ubld8.AND(h, h, brw_imm_ud(0xFFFF));
 
-   /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
-    * the non-array-index offset, we could use to decide if we can perform
-    * either one or (at most) two writes instead one per component.
-    */
-
-   if (nir_src_is_const(*offset_nir_src))
+   if (nir_src_is_const(*offset_nir_src)) {
       emit_urb_direct_writes(bld, instr, src, h);
-   else
-      emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src), h);
+   } else {
+      bool use_mod = false;
+      unsigned mod;
+
+      if (offset_nir_src->is_ssa) {
+         /* Try to calculate the value of (offset + base) % 4. If we can do
+          * this, then we can do indirect writes using only up to 2 URB
+          * writes (1 if modulo + num_comps is <= 4).
+          */
+         use_mod = nir_mod_analysis(nir_get_ssa_scalar(offset_nir_src->ssa, 0), nir_type_uint, 4, &mod);
+         if (use_mod) {
+            mod += nir_intrinsic_base(instr) + component_from_intrinsic(instr);
+            mod %= 4;
+         }
+      }
+
+      if (use_mod) {
+         emit_urb_indirect_writes_mod(bld, instr, src, get_nir_src(*offset_nir_src), h, mod);
+      } else {
+         emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src), h);
+      }
+   }
 }
 
 void
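
Illustrative aside, not part of the patch: emit_urb_indirect_writes_mod shifts the written components by (offset + base) % 4 inside a vec4 URB slot and, when the data straddles a slot boundary, issues a second write for the remainder. Below is a minimal standalone sketch of that mask/component arithmetic, with hand-picked values (comps = 4, mask = 0xF, mod = 3) and std::min standing in for Mesa's MIN2 macro, so the two resulting writes can be checked outside the driver.

/* Standalone sketch of the split in emit_urb_indirect_writes_mod
 * (illustrative values only). */
#include <algorithm>
#include <cstdio>

int main()
{
   const unsigned comps = 4;    /* vec4 store                   */
   const unsigned mask  = 0xF;  /* all four components written  */
   const unsigned mod   = 3;    /* (offset + base) % 4          */

   const unsigned comp_shift   = mod;
   const unsigned first_comps  = std::min(comps, 4 - comp_shift);
   const unsigned second_comps = comps - first_comps;
   const unsigned first_mask   = (mask << comp_shift) & 0xF;
   const unsigned second_mask  = (mask >> (4 - comp_shift)) & 0xF;

   /* Expected: the first write covers 1 component with channel mask 0x8
    * (component 0 lands in the last dword of the first vec4 slot); the
    * second write covers the remaining 3 components with channel mask 0x7. */
   printf("first:  comps=%u mask=0x%X\n", first_comps, first_mask);
   printf("second: comps=%u mask=0x%X\n", second_comps, second_mask);
   return 0;
}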