diff --git a/src/intel/compiler/brw_rt.h b/src/intel/compiler/brw_rt.h
index d03187636f6..15c024072f1 100644
--- a/src/intel/compiler/brw_rt.h
+++ b/src/intel/compiler/brw_rt.h
@@ -230,6 +230,18 @@ brw_rt_compute_scratch_layout(struct brw_rt_scratch_layout *layout,
    assert(size % 64 == 0);
    layout->sw_stack_start = size;
    layout->sw_stack_size = ALIGN(sw_stack_size, 64);
+
+   /* Currently sw_stack_size is always a power of two, but power-of-two
+    * SW stack sizes are prone to causing collisions in the hashing
+    * function used by the L3 to map memory addresses to banks, which can
+    * cause stack accesses from most DSSes to bottleneck on a single L3
+    * bank.  Fix it by padding the SW stack by a single cacheline (64B)
+    * if its aligned size is a power of two larger than one cacheline.
+    */
+   if (layout->sw_stack_size > 64 &&
+       util_is_power_of_two_nonzero(layout->sw_stack_size))
+      layout->sw_stack_size += 64;
+
    size += num_stack_ids * layout->sw_stack_size;
 
    layout->total_size = size;