diff --git a/src/freedreno/ci/deqp-freedreno-a630-fails.txt b/src/freedreno/ci/deqp-freedreno-a630-fails.txt index 18325d68824..e5790183e98 100644 --- a/src/freedreno/ci/deqp-freedreno-a630-fails.txt +++ b/src/freedreno/ci/deqp-freedreno-a630-fails.txt @@ -14,11 +14,6 @@ KHR-GL33.transform_feedback.query_vertex_separate_test,Fail # "*** Color comparison failed" KHR-GLES3.packed_depth_stencil.verify_read_pixels.depth24_stencil8,Fail -# "MESA: error: ir3_ra() failed!" -KHR-GLES31.core.arrays_of_arrays.InteractionFunctionCalls2,Fail -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing5,Fail -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing6,Fail - # "The values of resultStd[i] & 0xFFFFFFFE and resultFma[i] & 0xFFFFFFFE and resultCPU[i] & 0xFFFFFFFE are not bitwise equal for i = 0..99 " KHR-GLES31.core.gpu_shader5.fma_precision_float,Fail KHR-GLES31.core.gpu_shader5.fma_precision_vec2,Fail @@ -86,11 +81,6 @@ dEQP-VK.api.info.get_physical_device_properties2.properties,Fail dEQP-VK.api.object_management.alloc_callback_fail.device,Fail dEQP-VK.api.object_management.alloc_callback_fail.device_group,Fail -# "MESA: error: ir3_ra() failed!" -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite,Fail -dEQP-VK.graphicsfuzz.spv-stable-pillars-volatile-nontemporal-store,Fail - # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3019 # should be fixed by https://gerrit.khronos.org/c/vk-gl-cts/+/7745 dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.7,Fail @@ -98,10 +88,6 @@ dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.7,Fail dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.7,Fail dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.7,Fail -# "MESA: error: ir3_ra() failed! -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -dEQP-VK.spirv_assembly.instruction.compute.opcopymemory.array,Fail - # "deqp-vk: ../src/freedreno/vulkan/tu_cs.h:186: tu_cs_reserve: Assertion `tu_cs_get_space(cs) >= reserved_size' failed." # https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8841 dEQP-VK.spirv_assembly.instruction.compute.opphi.wide,Crash @@ -120,14 +106,6 @@ dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_si dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail -# "MESA: error: ir3_ra() failed!" -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -# Needs spilling, or maybe some scheduling (though throwing a bit of nir_move/sink -# at it didn't help). 
-dEQP-VK.spirv_assembly.instruction.spirv1p4.opcopylogical.nested_arrays_different_inner_stride,Fail -dEQP-VK.spirv_assembly.instruction.spirv1p4.opcopylogical.nested_arrays_different_outer_stride,Fail -dEQP-VK.spirv_assembly.instruction.spirv1p4.opcopylogical.nested_arrays_different_strides,Fail - dEQP-VK.texture.filtering.2d.formats.d24_unorm_s8_uint_stencil.nearest,Fail dEQP-VK.texture.filtering.2d_array.formats.d24_unorm_s8_uint_stencil.d24_unorm_s8_uint_stencil_nearest,Fail dEQP-VK.texture.filtering.cube.formats.d24_unorm_s8_uint_stencil.nearest,Fail @@ -136,205 +114,6 @@ dEQP-VK.texture.filtering.unnormal.formats.d24_unorm_s8_uint_stencil.nearest,Fai # Broken on all drivers: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4582 dEQP-VK.wsi.display_control.register_device_event,Fail -# "MESA: error: ir3_ra() failed!" -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3,Fail 
-dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3_comp_access_store_cols,Fail 
-dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2_comp_access,Fail 
-dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3_store_cols,Fail 
-dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.random.all_shared_buffer.5,Fail -dEQP-VK.ssbo.layout.random.nested_structs_arrays.0,Fail -dEQP-VK.ssbo.layout.random.nested_structs_arrays.17,Fail -dEQP-VK.ssbo.layout.random.scalar.19,Fail - bypass-dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.7,Fail bypass-dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.7,Fail bypass-dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.7,Fail diff --git a/src/freedreno/ci/deqp-freedreno-a630-skips.txt b/src/freedreno/ci/deqp-freedreno-a630-skips.txt index b3531a2c6ac..d95dac8a996 100644 --- a/src/freedreno/ci/deqp-freedreno-a630-skips.txt +++ b/src/freedreno/ci/deqp-freedreno-a630-skips.txt @@ -25,3 +25,8 @@ dEQP-VK.ubo.random.all_shared_buffer.48 # Still running after 3 hours, time is spent in batch_draw_tracking(). KHR-GLES31.core.shader_image_load_store.basic-allFormats-store-fs + +# causes a hangcheck timeout on a630: +# msm ae00000.mdss: [drm:hangcheck_handler] *ERROR* A630: hangcheck detected gpu lockup rb 0! 
+dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite +dEQP-VK.graphicsfuzz.spv-stable-pillars-volatile-nontemporal-store diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c index 4a0fb40bbd1..ee4524797bd 100644 --- a/src/freedreno/ir3/disasm-a3xx.c +++ b/src/freedreno/ir3/disasm-a3xx.c @@ -348,6 +348,9 @@ static const struct opc_info { OPC(6, OPC_GETSPID, getspid), OPC(6, OPC_GETWID, getwid), + OPC(6, OPC_SPILL_MACRO, spill.macro), + OPC(6, OPC_RELOAD_MACRO, reload.macro), + OPC(7, OPC_BAR, bar), OPC(7, OPC_FENCE, fence), /* clang-format on */ diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h index 93162c103b4..f367d6a197d 100644 --- a/src/freedreno/ir3/instr-a3xx.h +++ b/src/freedreno/ir3/instr-a3xx.h @@ -308,6 +308,9 @@ typedef enum { OPC_LDG_A = _OPC(6, 55), OPC_STG_A = _OPC(6, 56), + OPC_SPILL_MACRO = _OPC(6, 57), + OPC_RELOAD_MACRO = _OPC(6, 58), + /* category 7: */ OPC_BAR = _OPC(7, 0), OPC_FENCE = _OPC(7, 1), diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 187f8e13d3e..e0c678b0534 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -89,6 +89,7 @@ struct ir3_merge_set { uint16_t alignment; unsigned interval_start; + unsigned spill_slot; unsigned regs_count; struct ir3_register **regs; @@ -202,6 +203,8 @@ struct ir3_register { */ struct ir3_register *tied; + unsigned spill_slot, next_use; + unsigned merge_set_offset; struct ir3_merge_set *merge_set; unsigned interval_start, interval_end; @@ -711,6 +714,17 @@ ir3_instr_move_after(struct ir3_instruction *instr, list_add(&instr->node, &before->node); } +/** + * Move 'instr' to the beginning of the block: + */ +static inline void +ir3_instr_move_before_block(struct ir3_instruction *instr, + struct ir3_block *block) +{ + list_delinit(&instr->node); + list_add(&instr->node, &block->instr_list); +} + void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps); void ir3_set_dst_type(struct ir3_instruction *instr, bool half); diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 29a59052f99..cf407e6b2b3 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -44,6 +44,7 @@ static const struct debug_named_value shader_debug_options[] = { {"nouboopt", IR3_DBG_NOUBOOPT, "Disable lowering UBO to uniform"}, {"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"}, {"nocache", IR3_DBG_NOCACHE, "Disable shader cache"}, + {"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"}, #ifdef DEBUG /* DEBUG-only options: */ {"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"}, diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index cf2fe2ad221..afe6113b1e4 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -194,6 +194,7 @@ enum ir3_shader_debug { IR3_DBG_NOUBOOPT = BITFIELD_BIT(9), IR3_DBG_NOFP16 = BITFIELD_BIT(10), IR3_DBG_NOCACHE = BITFIELD_BIT(11), + IR3_DBG_SPILLALL = BITFIELD_BIT(12), /* DEBUG-only options: */ IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20), diff --git a/src/freedreno/ir3/ir3_lower_spill.c b/src/freedreno/ir3/ir3_lower_spill.c new file mode 100644 index 00000000000..265207105e9 --- /dev/null +++ b/src/freedreno/ir3/ir3_lower_spill.c @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the 
Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" + +/* The spilling pass leaves out a few details required to successfully operate + * ldp/stp: + * + * 1. ldp/stp can only load/store 4 components at a time, but spilling ignores + * that and just spills/restores entire values, including arrays and values + * created for texture setup which can be more than 4 components. + * 2. The spiller doesn't add barrier dependencies needed for post-RA + * scheduling. + * + * The first one, in particular, is much easier to handle after RA because + * arrays and normal values can be treated the same way. Therefore this pass + * runs after RA, and handles both issues. This keeps the complexity out of the + * spiller. + */ + +static void +split_spill(struct ir3_instruction *spill) +{ + unsigned orig_components = spill->srcs[2]->uim_val; + + /* We don't handle splitting dependencies. */ + assert(spill->deps_count == 0); + + if (orig_components <= 4) { + if (spill->srcs[1]->flags & IR3_REG_ARRAY) { + spill->srcs[1]->wrmask = MASK(orig_components); + spill->srcs[1]->num = spill->srcs[1]->array.base; + spill->srcs[1]->flags &= ~IR3_REG_ARRAY; + } + return; + } + + for (unsigned comp = 0; comp < orig_components; comp += 4) { + unsigned components = MIN2(orig_components - comp, 4); + struct ir3_instruction *clone = ir3_instr_clone(spill); + ir3_instr_move_before(clone, spill); + + clone->srcs[1]->wrmask = MASK(components); + if (clone->srcs[1]->flags & IR3_REG_ARRAY) { + clone->srcs[1]->num = clone->srcs[1]->array.base + comp; + clone->srcs[1]->flags &= ~IR3_REG_ARRAY; + } + + clone->srcs[2]->uim_val = components; + clone->cat6.dst_offset += + comp * ((spill->srcs[1]->flags & IR3_REG_HALF) ? 
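/* 2 bytes per half component, 4 per full */ 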
2 : 4); + } + + list_delinit(&spill->node); +} + +static void +split_reload(struct ir3_instruction *reload) +{ + unsigned orig_components = reload->srcs[2]->uim_val; + + assert(reload->deps_count == 0); + + if (orig_components <= 4) { + if (reload->dsts[0]->flags & IR3_REG_ARRAY) { + reload->dsts[0]->wrmask = MASK(orig_components); + reload->dsts[0]->num = reload->dsts[0]->array.base; + reload->dsts[0]->flags &= ~IR3_REG_ARRAY; + } + return; + } + + for (unsigned comp = 0; comp < orig_components; comp += 4) { + unsigned components = MIN2(orig_components - comp, 4); + struct ir3_instruction *clone = ir3_instr_clone(reload); + ir3_instr_move_before(clone, reload); + + clone->dsts[0]->wrmask = MASK(components); + if (clone->dsts[0]->flags & IR3_REG_ARRAY) { + clone->dsts[0]->num = clone->dsts[0]->array.base + comp; + clone->dsts[0]->flags &= ~IR3_REG_ARRAY; + } + + clone->srcs[2]->uim_val = components; + clone->srcs[1]->uim_val += + comp * ((reload->dsts[0]->flags & IR3_REG_HALF) ? 2 : 4); + } + + list_delinit(&reload->node); +} + +static void +add_spill_reload_deps(struct ir3_block *block) +{ + struct ir3_instruction *last_spill = NULL; + + foreach_instr (instr, &block->instr_list) { + if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) && + last_spill) { + ir3_instr_add_dep(instr, last_spill); + } + + if (instr->opc == OPC_SPILL_MACRO) + last_spill = instr; + } + + + last_spill = NULL; + + foreach_instr_rev (instr, &block->instr_list) { + if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) && + last_spill) { + ir3_instr_add_dep(last_spill, instr); + } + + if (instr->opc == OPC_SPILL_MACRO) + last_spill = instr; + } +} + +bool +ir3_lower_spill(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + foreach_instr_safe (instr, &block->instr_list) { + if (instr->opc == OPC_SPILL_MACRO) + split_spill(instr); + else if (instr->opc == OPC_RELOAD_MACRO) + split_reload(instr); + } + + add_spill_reload_deps(block); + + foreach_instr (instr, &block->instr_list) { + if (instr->opc == OPC_SPILL_MACRO) + instr->opc = OPC_STP; + else if (instr->opc == OPC_RELOAD_MACRO) + instr->opc = OPC_LDP; + } + } + + return true; +} diff --git a/src/freedreno/ir3/ir3_merge_regs.c b/src/freedreno/ir3/ir3_merge_regs.c index bb88dbe8fc7..674bc648e03 100644 --- a/src/freedreno/ir3/ir3_merge_regs.c +++ b/src/freedreno/ir3/ir3_merge_regs.c @@ -198,6 +198,7 @@ get_merge_set(struct ir3_register *def) struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set); set->preferred_reg = ~0; set->interval_start = ~0; + set->spill_slot = ~0; set->size = reg_size(def); set->alignment = (def->flags & IR3_REG_HALF) ? 
1 : 2; set->regs_count = 1; @@ -339,6 +340,19 @@ try_merge_defs(struct ir3_liveness *live, struct ir3_register *a, merge_merge_sets(a_set, b_set, b_set_offset); } +void +ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset) +{ + struct ir3_merge_set *a_set = get_merge_set(a); + struct ir3_merge_set *b_set = get_merge_set(b); + + if (a_set == b_set) + return; + + int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset; + merge_merge_sets(a_set, b_set, b_set_offset); +} + static void coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi) { @@ -462,7 +476,7 @@ ir3_create_parallel_copies(struct ir3 *ir) } static void -index_merge_sets(struct ir3 *ir) +index_merge_sets(struct ir3_liveness *live, struct ir3 *ir) { unsigned offset = 0; foreach_block (block, &ir->block_list) { @@ -489,6 +503,8 @@ index_merge_sets(struct ir3 *ir) } } } + + live->interval_offset = offset; } #define RESET "\x1b[0m" @@ -559,7 +575,7 @@ ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir) } } - index_merge_sets(ir); + index_merge_sets(live, ir); if (ir3_shader_debug & IR3_DBG_RAMSGS) dump_merge_sets(ir); diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 9c320e0ed90..6870769f74d 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -1990,6 +1990,152 @@ calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure) return (target - 1) * 2 * 4; } +static void +add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg, + bool merged_regs) +{ + unsigned size = reg_size(reg); + if (reg->flags & IR3_REG_HALF) + pressure->half += size; + if (!(reg->flags & IR3_REG_HALF) || merged_regs) + pressure->full += size; +} + +static void +dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) +{ +} + +static void +dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) +{ +} + +static void +dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent, + struct ir3_reg_interval *child) +{ +} + +/* Calculate the minimum possible limit on register pressure so that spilling + * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall. + */ + +static void +calc_min_limit_pressure(struct ir3_shader_variant *v, + struct ir3_liveness *live, + struct ir3_pressure *limit) +{ + struct ir3_block *start = ir3_start_block(v->ir); + struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx); + struct ir3_reg_interval *intervals = + rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count); + + ctx->interval_add = dummy_interval_add; + ctx->interval_delete = dummy_interval_delete; + ctx->interval_readd = dummy_interval_readd; + + limit->full = limit->half = 0; + + struct ir3_pressure cur_pressure = {0}; + foreach_instr (input, &start->instr_list) { + if (input->opc != OPC_META_INPUT && + input->opc != OPC_META_TEX_PREFETCH) + break; + + add_pressure(&cur_pressure, input->dsts[0], v->mergedregs); + } + + limit->full = MAX2(limit->full, cur_pressure.full); + limit->half = MAX2(limit->half, cur_pressure.half); + + foreach_instr (input, &start->instr_list) { + if (input->opc != OPC_META_INPUT && + input->opc != OPC_META_TEX_PREFETCH) + break; + + /* pre-colored inputs may have holes, which increases the pressure. 
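An input
+       * precolored to, say, r2.x (a made-up example) makes
+       * ra_reg_get_physreg() return 16, so the MAX2() below pushes the limit
+       * to at least 16 + reg_size(dst), even if r0 and r1 are never used.
+       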
*/ + struct ir3_register *dst = input->dsts[0]; + if (dst->num != INVALID_REG) { + unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst); + if (dst->flags & IR3_REG_HALF) + limit->half = MAX2(limit->half, physreg); + if (!(dst->flags & IR3_REG_HALF) || v->mergedregs) + limit->full = MAX2(limit->full, physreg); + } + } + + foreach_block (block, &v->ir->block_list) { + rb_tree_init(&ctx->intervals); + + unsigned name; + BITSET_FOREACH_SET (name, live->live_in[block->index], + live->definitions_count) { + struct ir3_register *reg = live->definitions[name]; + ir3_reg_interval_init(&intervals[reg->name], reg); + ir3_reg_interval_insert(ctx, &intervals[reg->name]); + } + + foreach_instr (instr, &block->instr_list) { + ra_foreach_dst (dst, instr) { + ir3_reg_interval_init(&intervals[dst->name], dst); + } + /* phis and parallel copies can be deleted via spilling */ + + if (instr->opc == OPC_META_PHI) { + ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]); + continue; + } + + if (instr->opc == OPC_META_PARALLEL_COPY) + continue; + + cur_pressure = (struct ir3_pressure) {0}; + + ra_foreach_dst (dst, instr) { + if (dst->tied && !(dst->tied->flags & IR3_REG_KILL)) + add_pressure(&cur_pressure, dst, v->mergedregs); + } + + ra_foreach_src_rev (src, instr) { + /* We currently don't support spilling the parent of a source when + * making space for sources, so we have to keep track of the + * intervals and figure out the root of the tree to figure out how + * much space we need. + * + * TODO: We should probably support this in the spiller. + */ + struct ir3_reg_interval *interval = &intervals[src->def->name]; + while (interval->parent) + interval = interval->parent; + add_pressure(&cur_pressure, interval->reg, v->mergedregs); + + if (src->flags & IR3_REG_FIRST_KILL) + ir3_reg_interval_remove(ctx, &intervals[src->def->name]); + } + + limit->full = MAX2(limit->full, cur_pressure.full); + limit->half = MAX2(limit->half, cur_pressure.half); + + cur_pressure = (struct ir3_pressure) {0}; + + ra_foreach_dst (dst, instr) { + ir3_reg_interval_init(&intervals[dst->name], dst); + ir3_reg_interval_insert(ctx, &intervals[dst->name]); + add_pressure(&cur_pressure, dst, v->mergedregs); + } + + limit->full = MAX2(limit->full, cur_pressure.full); + limit->half = MAX2(limit->half, cur_pressure.half); + } + } + + /* Account for the base register, which needs to be available everywhere. */ + limit->full += 2; + + ralloc_free(ctx); +} + int ir3_ra(struct ir3_shader_variant *v) { @@ -2010,15 +2156,35 @@ ir3_ra(struct ir3_shader_variant *v) d("\thalf: %u", max_pressure.half); d("\tshared: %u", max_pressure.shared); - if (v->mergedregs) { - max_pressure.full += max_pressure.half; - max_pressure.half = 0; + /* TODO: calculate half/full limit correctly for CS with barrier */ + struct ir3_pressure limit_pressure; + limit_pressure.full = RA_FULL_SIZE; + limit_pressure.half = RA_HALF_SIZE; + limit_pressure.shared = RA_SHARED_SIZE; + + /* If requested, lower the limit so that spilling happens more often. 
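This is the
+    * IR3_DBG_SPILLALL path from above. A usage sketch (IR3_SHADER_DEBUG is the
+    * existing ir3 debug environment variable; the deqp invocation is only an
+    * example):
+    *
+    *    IR3_SHADER_DEBUG=spillall ./deqp-vk --deqp-case=dEQP-VK.ssbo.*
+    *
+    * This drops the limit to the minimum computed by calc_min_limit_pressure(),
+    * so nearly every live value is spilled and reloaded.
+    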
*/ + if (ir3_shader_debug & IR3_DBG_SPILLALL) + calc_min_limit_pressure(v, live, &limit_pressure); + + if (max_pressure.shared > limit_pressure.shared) { + /* TODO shared reg -> normal reg spilling */ + d("shared max pressure exceeded!"); + return 1; } - if (max_pressure.full > RA_FULL_SIZE || max_pressure.half > RA_HALF_SIZE || - max_pressure.shared > RA_SHARED_SIZE) { - d("max pressure exceeded!"); - return 1; + bool spilled = false; + if (max_pressure.full > limit_pressure.full || + max_pressure.half > limit_pressure.half) { + if (!v->shader->compiler->has_pvtmem) { + d("max pressure exceeded!"); + return 1; + } + d("max pressure exceeded, spilling!"); + IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure); + ir3_calc_pressure(v, live, &max_pressure); + assert(max_pressure.full <= limit_pressure.full && + max_pressure.half <= limit_pressure.half); + spilled = true; } struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx); @@ -2054,19 +2220,20 @@ ir3_ra(struct ir3_shader_variant *v) for (unsigned i = 0; i < instr->dsts_count; i++) { instr->dsts[i]->flags &= ~IR3_REG_SSA; - /* Parallel copies of array registers copy the whole register, - * and we need some way to let the parallel copy code know - * that this was an array whose size is determined by - * reg->size. So keep the array flag on those. + /* Parallel copies of array registers copy the whole register, and + * we need some way to let the parallel copy code know that this was + * an array whose size is determined by reg->size. So keep the array + * flag on those. spill/reload also need to work on the entire + * array. */ - if (!is_meta(instr)) + if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO) instr->dsts[i]->flags &= ~IR3_REG_ARRAY; } for (unsigned i = 0; i < instr->srcs_count; i++) { instr->srcs[i]->flags &= ~IR3_REG_SSA; - if (!is_meta(instr)) + if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO) instr->srcs[i]->flags &= ~IR3_REG_ARRAY; } } @@ -2074,6 +2241,10 @@ ir3_ra(struct ir3_shader_variant *v) ir3_debug_print(v->ir, "AFTER: register allocation"); + if (spilled) { + IR3_PASS(v->ir, ir3_lower_spill); + } + ir3_lower_copies(v); ir3_debug_print(v->ir, "AFTER: ir3_lower_copies"); diff --git a/src/freedreno/ir3/ir3_ra.h b/src/freedreno/ir3/ir3_ra.h index fcef6a908e1..4a7c9d7752a 100644 --- a/src/freedreno/ir3/ir3_ra.h +++ b/src/freedreno/ir3/ir3_ra.h @@ -137,6 +137,7 @@ ra_reg_is_dst(const struct ir3_register *reg) struct ir3_liveness { unsigned block_count; + unsigned interval_offset; DECLARE_ARRAY(struct ir3_register *, definitions); DECLARE_ARRAY(BITSET_WORD *, live_out); DECLARE_ARRAY(BITSET_WORD *, live_in); @@ -151,6 +152,9 @@ void ir3_create_parallel_copies(struct ir3 *ir); void ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir); +void ir3_force_merge(struct ir3_register *a, struct ir3_register *b, + int b_offset); + struct ir3_pressure { unsigned full, half, shared; }; @@ -158,6 +162,12 @@ struct ir3_pressure { void ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live, struct ir3_pressure *max_pressure); +bool ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v, + struct ir3_liveness **live, + const struct ir3_pressure *limit_pressure); + +bool ir3_lower_spill(struct ir3 *ir); + void ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size, unsigned half_size, unsigned block_count); diff --git a/src/freedreno/ir3/ir3_spill.c b/src/freedreno/ir3/ir3_spill.c index 54ae56b1beb..0b9c56a7680 100644 --- a/src/freedreno/ir3/ir3_spill.c +++ b/src/freedreno/ir3/ir3_spill.c @@ -26,62 
+26,318 @@
 #include "ir3_shader.h"
 
 /*
- * This pass does one thing so far:
+ * This pass does two things:
  *
  * 1. Calculates the maximum register pressure. To do this, we need to use the
- * exact same technique that RA uses for combining meta_split instructions
- * with their sources, so that our calculation agrees with RA.
- *
- * It will also optionally spill registers once that's implemented.
+ *    exact same technique that RA uses for combining meta_split instructions
+ *    with their sources, so that our calculation agrees with RA.
+ * 2. Spills when the register pressure exceeds a limit calculated by RA.
+ *    The implementation is based on "Register Spilling and Live-Range Splitting
+ *    for SSA-Form Programs" by Braun and Hack, although again care has to be
+ *    taken to handle combining split/collect instructions.
  */
 
+struct reg_or_immed {
+   unsigned flags;
+   union {
+      struct ir3_register *def;
+      uint32_t uimm;
+      unsigned const_num;
+   };
+};
+
 struct ra_spill_interval {
    struct ir3_reg_interval interval;
+
+   struct rb_node node;
+   struct rb_node half_node;
+
+   /* The current SSA value/const/immed this source is mapped to. */
+   struct reg_or_immed dst;
+
+   /* When computing use distances we use the distance relative to the start
+    * of the block. So, for example, a value that's defined in cycle 5 of the
+    * block and used 6 cycles later will always have a next_use_distance of 11
+    * until we reach that use.
+    */
+   unsigned next_use_distance;
+
+   /* Whether this value was reloaded and therefore doesn't need to be
+    * spilled again. Corresponds to the S set in the paper.
+    */
+   bool already_spilled;
+
+   /* We need to add sources early for accounting purposes, but we have to
+    * insert the reload code for them last. Keep track of whether this interval
+    * needs to be reloaded later.
+    */
+   bool needs_reload;
+
+   /* Keep track of whether this interval currently can't be spilled because:
+    * - It or one of its children is a source and we're making space for
+    *   sources.
+    * - It is a destination and we're making space for destinations.
+    */
+   bool cant_spill;
+};
+
+struct ra_spill_block_state {
+   unsigned *next_use_end;
+   unsigned *next_use_start;
+
+   unsigned cycles;
+
+   /* Map from SSA def to the reg_or_immed it is mapped to at the end of the
+    * block. This map only contains values which we didn't spill, so it also
+    * serves as a record of the new live-out set for this block.
+    */
+   struct hash_table *remap;
+
+   /* For blocks whose successors are visited first (i.e. loop backedges), which
+    * values should be live at the end.
+    */
+   BITSET_WORD *live_out;
+
+   bool visited;
 };
 
 struct ra_spill_ctx {
    struct ir3_reg_ctx reg_ctx;
 
-   struct ra_spill_interval *intervals;
+   struct ra_spill_interval **intervals;
+   unsigned intervals_count;
+
+   /* rb tree of live intervals that we can spill, ordered by next-use distance.
+    * full_live_intervals also contains the half intervals in the merged_regs
+    * case. We use this list to determine what to spill.
+    */
+   struct rb_tree full_live_intervals;
+   struct rb_tree half_live_intervals;
 
    struct ir3_pressure cur_pressure, max_pressure;
+   struct ir3_pressure limit_pressure;
+
+   /* When spilling, we need to reserve a register to serve as the zero'd
+    * "base". For simplicity we reserve a register at the beginning so that it's
+    * always available.
+    */
+   struct ir3_register *base_reg;
+
+   /* Current pvtmem offset in bytes. 
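Each spilled value is assigned a
+    * slot here by get_spill_slot(). A sketch of the arithmetic for a value
+    * without a merge set (reg_size() counts 16-bit half-reg units, hence the
+    * "* 2" to convert to bytes):
+    *
+    *    reg->spill_slot = ALIGN_POT(ctx->spill_slot, reg_elem_size(reg));
+    *    ctx->spill_slot = reg->spill_slot + reg_size(reg) * 2;
+    *
+    * e.g. a full-precision vec4 (reg_size() == 8) takes 16 bytes of pvtmem,
+    * and a half vec4 takes 8.
+    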
*/ + unsigned spill_slot; + struct ir3_liveness *live; const struct ir3_compiler *compiler; + + struct ra_spill_block_state *blocks; + + bool spilling; + + bool merged_regs; }; +static void +add_base_reg(struct ra_spill_ctx *ctx, struct ir3 *ir) +{ + struct ir3_block *start = ir3_start_block(ir); + + /* We need to stick it after any meta instructions which need to be first. */ + struct ir3_instruction *after = NULL; + foreach_instr (instr, &start->instr_list) { + if (instr->opc != OPC_META_INPUT && + instr->opc != OPC_META_TEX_PREFETCH) { + after = instr; + break; + } + } + + struct ir3_instruction *mov = create_immed(start, 0); + + if (after) + ir3_instr_move_before(mov, after); + + ctx->base_reg = mov->dsts[0]; + + /* We don't create an interval, etc. for the base reg, so just lower the + * register pressure limit to account for it. We assume it's always + * available for simplicity. + */ + ctx->limit_pressure.full -= reg_size(ctx->base_reg); +} + + +/* Compute the number of cycles per instruction used for next-use-distance + * analysis. This is just approximate, obviously. + */ +static unsigned +instr_cycles(struct ir3_instruction *instr) +{ + if (instr->opc == OPC_META_PARALLEL_COPY) { + unsigned cycles = 0; + for (unsigned i = 0; i < instr->dsts_count; i++) { + if (!instr->srcs[i]->def || + instr->srcs[i]->def->merge_set != instr->dsts[i]->merge_set) { + cycles += reg_elems(instr->srcs[i]); + } + } + + return cycles; + } + + if (instr->opc == OPC_META_COLLECT) { + unsigned cycles = 0; + for (unsigned i = 0; i < instr->srcs_count; i++) { + if (!instr->srcs[i]->def || + instr->srcs[i]->def->merge_set != instr->dsts[0]->merge_set) { + cycles++; + } + } + + return cycles; + } + + if (is_meta(instr)) + return 0; + + return 1 + instr->repeat; +} + +static bool +compute_block_next_distance(struct ra_spill_ctx *ctx, struct ir3_block *block, + unsigned *tmp_next_use) +{ + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + memcpy(tmp_next_use, state->next_use_end, + ctx->live->definitions_count * sizeof(*tmp_next_use)); + + unsigned cycle = state->cycles; + foreach_instr_rev (instr, &block->instr_list) { + ra_foreach_dst (dst, instr) { + dst->next_use = tmp_next_use[dst->name]; + } + + ra_foreach_src (src, instr) { + src->next_use = tmp_next_use[src->def->name]; + } + + cycle -= instr_cycles(instr); + + if (instr->opc == OPC_META_PARALLEL_COPY) { + ra_foreach_src_n (src, i, instr) { + if (src->def->merge_set == instr->dsts[i]->merge_set && + src->def->merge_set_offset == instr->dsts[i]->merge_set_offset) { + tmp_next_use[src->def->name] = + tmp_next_use[instr->dsts[i]->name]; + } else { + tmp_next_use[src->def->name] = cycle; + } + } + } else if (instr->opc != OPC_META_PHI) { + ra_foreach_src (src, instr) { + tmp_next_use[src->def->name] = cycle; + } + } + + ra_foreach_dst (dst, instr) { + tmp_next_use[dst->name] = UINT_MAX; + } + } + + memcpy(state->next_use_start, tmp_next_use, + ctx->live->definitions_count * sizeof(*tmp_next_use)); + + bool progress = false; + for (unsigned i = 0; i < block->predecessors_count; i++) { + const struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *pred_state = &ctx->blocks[pred->index]; + + /* Add a large-enough distance in front of edges exiting the loop so that + * variables that are live-through the loop but not used inside it are + * prioritized for spilling, as per the paper. This just needs to be + * larger than the longest path through the loop. 
+ */ + bool loop_exit = pred->loop_depth < block->loop_depth; + unsigned block_distance = pred_state->cycles + (loop_exit ? 100000 : 0); + + for (unsigned j = 0; j < ctx->live->definitions_count; j++) { + if (state->next_use_start[j] < UINT_MAX && + state->next_use_start[j] + block_distance < + pred_state->next_use_end[j]) { + pred_state->next_use_end[j] = state->next_use_start[j] + + block_distance; + progress = true; + } + } + + foreach_instr (phi, &block->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + if (!phi->srcs[i]->def) + continue; + unsigned src = phi->srcs[i]->def->name; + if (phi->dsts[0]->next_use < UINT_MAX && + phi->dsts[0]->next_use + block_distance < + pred_state->next_use_end[src]) { + pred_state->next_use_end[src] = phi->dsts[0]->next_use + + block_distance; + progress = true; + } + } + } + + return progress; +} + +static void +compute_next_distance(struct ra_spill_ctx *ctx, struct ir3 *ir) +{ + for (unsigned i = 0; i < ctx->live->block_count; i++) { + ctx->blocks[i].next_use_start = + ralloc_array(ctx, unsigned, ctx->live->definitions_count); + ctx->blocks[i].next_use_end = + ralloc_array(ctx, unsigned, ctx->live->definitions_count); + + for (unsigned j = 0; j < ctx->live->definitions_count; j++) { + ctx->blocks[i].next_use_start[j] = UINT_MAX; + ctx->blocks[i].next_use_end[j] = UINT_MAX; + } + } + + foreach_block (block, &ir->block_list) { + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + state->cycles = 0; + foreach_instr (instr, &block->instr_list) { + state->cycles += instr_cycles(instr); + foreach_dst (dst, instr) { + dst->spill_slot = ~0; + } + } + } + + unsigned *tmp_next_use = + ralloc_array(ctx, unsigned, ctx->live->definitions_count); + + bool progress = true; + while (progress) { + progress = false; + foreach_block_rev (block, &ir->block_list) { + progress |= compute_block_next_distance(ctx, block, tmp_next_use); + } + } +} + static void ra_spill_interval_init(struct ra_spill_interval *interval, struct ir3_register *reg) { ir3_reg_interval_init(&interval->interval, reg); -} - -static void -ra_pressure_add(struct ir3_pressure *pressure, - struct ra_spill_interval *interval) -{ - unsigned size = reg_size(interval->interval.reg); - if (interval->interval.reg->flags & IR3_REG_SHARED) - pressure->shared += size; - else if (interval->interval.reg->flags & IR3_REG_HALF) - pressure->half += size; - else - pressure->full += size; -} - -static void -ra_pressure_sub(struct ir3_pressure *pressure, - struct ra_spill_interval *interval) -{ - unsigned size = reg_size(interval->interval.reg); - if (interval->interval.reg->flags & IR3_REG_SHARED) - pressure->shared -= size; - else if (interval->interval.reg->flags & IR3_REG_HALF) - pressure->half -= size; - else - pressure->full -= size; + interval->dst.flags = reg->flags; + interval->dst.def = reg; + interval->already_spilled = false; + interval->needs_reload = false; + interval->cant_spill = false; } static struct ra_spill_interval * @@ -90,19 +346,66 @@ ir3_reg_interval_to_interval(struct ir3_reg_interval *interval) return rb_node_data(struct ra_spill_interval, interval, interval); } +static struct ra_spill_interval * +ra_spill_interval_root(struct ra_spill_interval *interval) +{ + struct ir3_reg_interval *ir3_interval = &interval->interval; + while (ir3_interval->parent) + ir3_interval = ir3_interval->parent; + return ir3_reg_interval_to_interval(ir3_interval); +} + static struct ra_spill_ctx * ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx) { return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx); 
} +static int +ra_spill_interval_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ra_spill_interval *a = + rb_node_data(const struct ra_spill_interval, _a, node); + const struct ra_spill_interval *b = + rb_node_data(const struct ra_spill_interval, _b, node); + return a->next_use_distance - b->next_use_distance; +} + +static int +ra_spill_interval_half_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ra_spill_interval *a = + rb_node_data(const struct ra_spill_interval, _a, half_node); + const struct ra_spill_interval *b = + rb_node_data(const struct ra_spill_interval, _b, half_node); + return a->next_use_distance - b->next_use_distance; +} + static void interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval) { struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval); struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx); - ra_pressure_add(&ctx->cur_pressure, interval); + unsigned size = reg_size(interval->interval.reg); + if (interval->interval.reg->flags & IR3_REG_SHARED) { + ctx->cur_pressure.shared += size; + } else { + if (interval->interval.reg->flags & IR3_REG_HALF) { + ctx->cur_pressure.half += size; + if (ctx->spilling) { + rb_tree_insert(&ctx->half_live_intervals, &interval->half_node, + ra_spill_interval_half_cmp); + } + } + if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) { + ctx->cur_pressure.full += size; + if (ctx->spilling) { + rb_tree_insert(&ctx->full_live_intervals, &interval->node, + ra_spill_interval_cmp); + } + } + } } static void @@ -111,7 +414,23 @@ interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval) struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval); struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx); - ra_pressure_sub(&ctx->cur_pressure, interval); + unsigned size = reg_size(interval->interval.reg); + if (interval->interval.reg->flags & IR3_REG_SHARED) { + ctx->cur_pressure.shared -= size; + } else { + if (interval->interval.reg->flags & IR3_REG_HALF) { + ctx->cur_pressure.half -= size; + if (ctx->spilling) { + rb_tree_remove(&ctx->half_live_intervals, &interval->half_node); + } + } + if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) { + ctx->cur_pressure.full -= size; + if (ctx->spilling) { + rb_tree_remove(&ctx->full_live_intervals, &interval->node); + } + } + } } static void @@ -122,8 +441,22 @@ interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent, } static void -spill_ctx_init(struct ra_spill_ctx *ctx) +spill_ctx_init(struct ra_spill_ctx *ctx, struct ir3_shader_variant *v, + struct ir3_liveness *live) { + ctx->live = live; + ctx->intervals = ralloc_array(ctx, struct ra_spill_interval *, + ctx->live->definitions_count); + struct ra_spill_interval *intervals = + rzalloc_array(ctx, struct ra_spill_interval, + ctx->live->definitions_count); + for (unsigned i = 0; i < ctx->live->definitions_count; i++) + ctx->intervals[i] = &intervals[i]; + + ctx->intervals_count = ctx->live->definitions_count; + ctx->compiler = v->shader->compiler; + ctx->merged_regs = v->mergedregs; + rb_tree_init(&ctx->reg_ctx.intervals); ctx->reg_ctx.interval_add = interval_add; ctx->reg_ctx.interval_delete = interval_delete; @@ -147,18 +480,21 @@ ra_spill_ctx_remove(struct ra_spill_ctx *ctx, static void init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) { - struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + struct ra_spill_interval *interval = 
ctx->intervals[dst->name]; ra_spill_interval_init(interval, dst); + if (ctx->spilling) + interval->next_use_distance = dst->next_use; } static void insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) { - struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + struct ra_spill_interval *interval = ctx->intervals[dst->name]; if (interval->interval.inserted) return; ra_spill_ctx_insert(ctx, interval); + interval->cant_spill = true; /* For precolored inputs, make sure we leave enough registers to allow for * holes in the inputs. It can happen that the binning shader has a lower @@ -179,14 +515,26 @@ insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) } } +static void +insert_src(struct ra_spill_ctx *ctx, struct ir3_register *src) +{ + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; + + ra_spill_interval_root(interval)->cant_spill = true; + + if (interval->interval.inserted) + return; + + ra_spill_ctx_insert(ctx, interval); + interval->needs_reload = true; + interval->already_spilled = true; +} + static void remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { - if (!(src->flags & IR3_REG_FIRST_KILL)) - return; - - struct ra_spill_interval *interval = &ctx->intervals[src->def->name]; + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; if (!interval->interval.inserted || interval->interval.parent || !rb_tree_is_empty(&interval->interval.children)) @@ -199,10 +547,7 @@ static void remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { - if (!(src->flags & IR3_REG_FIRST_KILL)) - return; - - struct ra_spill_interval *interval = &ctx->intervals[src->def->name]; + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; if (!interval->interval.inserted) return; @@ -210,10 +555,17 @@ remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, ra_spill_ctx_remove(ctx, interval); } +static void +finish_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) +{ + struct ra_spill_interval *interval = ctx->intervals[dst->name]; + interval->cant_spill = false; +} + static void remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) { - struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + struct ra_spill_interval *interval = ctx->intervals[dst->name]; if (!interval->interval.inserted) return; @@ -221,6 +573,361 @@ remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) ra_spill_ctx_remove(ctx, interval); } +static void +update_src_next_use(struct ra_spill_ctx *ctx, struct ir3_register *src) +{ + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; + + assert(interval->interval.inserted); + + interval->next_use_distance = src->next_use; + + /* If this node is inserted in one of the trees, then it needs to be resorted + * as its key has changed. 
+ */ + if (!interval->interval.parent && !(src->flags & IR3_REG_SHARED)) { + if (src->flags & IR3_REG_HALF) { + rb_tree_remove(&ctx->half_live_intervals, &interval->half_node); + rb_tree_insert(&ctx->half_live_intervals, &interval->half_node, + ra_spill_interval_half_cmp); + } + if (ctx->merged_regs || !(src->flags & IR3_REG_HALF)) { + rb_tree_remove(&ctx->full_live_intervals, &interval->node); + rb_tree_insert(&ctx->full_live_intervals, &interval->node, + ra_spill_interval_cmp); + } + } +} + +static unsigned +get_spill_slot(struct ra_spill_ctx *ctx, struct ir3_register *reg) +{ + if (reg->merge_set) { + if (reg->merge_set->spill_slot == ~0) { + reg->merge_set->spill_slot = ALIGN_POT(ctx->spill_slot, + reg->merge_set->alignment); + ctx->spill_slot = reg->merge_set->spill_slot + reg->merge_set->size * 2; + } + return reg->merge_set->spill_slot + reg->merge_set_offset * 2; + } else { + if (reg->spill_slot == ~0) { + reg->spill_slot = ALIGN_POT(ctx->spill_slot, reg_elem_size(reg)); + ctx->spill_slot = reg->spill_slot + reg_size(reg) * 2; + } + return reg->spill_slot; + } +} + +static void +set_src_val(struct ir3_register *src, const struct reg_or_immed *val) +{ + if (val->flags & IR3_REG_IMMED) { + src->flags = IR3_REG_IMMED | (val->flags & IR3_REG_HALF); + src->uim_val = val->uimm; + src->def = NULL; + } else if (val->flags & IR3_REG_CONST) { + src->flags = IR3_REG_CONST | (val->flags & IR3_REG_HALF); + src->num = val->const_num; + src->def = NULL; + } else { + src->def = val->def; + } +} + +static struct ir3_register * +materialize_pcopy_src(const struct reg_or_immed *src, + struct ir3_instruction *instr, + struct ir3_block *block) +{ + struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1); + struct ir3_register *dst = __ssa_dst(mov); + dst->flags |= src->flags & IR3_REG_HALF; + struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, src->flags); + set_src_val(mov_src, src); + mov->cat1.src_type = mov->cat1.dst_type = + (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + + if (instr) + ir3_instr_move_before(mov, instr); + return dst; +} + +static void +spill(struct ra_spill_ctx *ctx, const struct reg_or_immed *val, + unsigned spill_slot, struct ir3_instruction *instr, struct ir3_block *block) +{ + struct ir3_register *reg; + + /* If spilling an immed/const pcopy src, we need to actually materialize it + * first with a mov. + */ + if (val->flags & (IR3_REG_CONST | IR3_REG_IMMED)) { + reg = materialize_pcopy_src(val, instr, block); + } else { + reg = val->def; + } + + d("spilling ssa_%u:%u to %u", reg->instr->serialno, reg->name, + spill_slot); + + unsigned elems = reg_elems(reg); + struct ir3_instruction *spill = + ir3_instr_create(block, OPC_SPILL_MACRO, 0, 3); + ir3_src_create(spill, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg; + unsigned src_flags = reg->flags & (IR3_REG_HALF | IR3_REG_IMMED | + IR3_REG_CONST | IR3_REG_SSA | + IR3_REG_ARRAY); + struct ir3_register *src = ir3_src_create(spill, INVALID_REG, src_flags); + ir3_src_create(spill, INVALID_REG, IR3_REG_IMMED)->uim_val = elems; + spill->cat6.dst_offset = spill_slot; + spill->cat6.type = (reg->flags & IR3_REG_HALF) ? 
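/* 16-bit stores for half regs */ 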
+static void
+set_src_val(struct ir3_register *src, const struct reg_or_immed *val)
+{
+   if (val->flags & IR3_REG_IMMED) {
+      src->flags = IR3_REG_IMMED | (val->flags & IR3_REG_HALF);
+      src->uim_val = val->uimm;
+      src->def = NULL;
+   } else if (val->flags & IR3_REG_CONST) {
+      src->flags = IR3_REG_CONST | (val->flags & IR3_REG_HALF);
+      src->num = val->const_num;
+      src->def = NULL;
+   } else {
+      src->def = val->def;
+   }
+}
+
+static struct ir3_register *
+materialize_pcopy_src(const struct reg_or_immed *src,
+                      struct ir3_instruction *instr,
+                      struct ir3_block *block)
+{
+   struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+   struct ir3_register *dst = __ssa_dst(mov);
+   dst->flags |= src->flags & IR3_REG_HALF;
+   struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, src->flags);
+   set_src_val(mov_src, src);
+   mov->cat1.src_type = mov->cat1.dst_type =
+      (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+
+   if (instr)
+      ir3_instr_move_before(mov, instr);
+   return dst;
+}
+
+static void
+spill(struct ra_spill_ctx *ctx, const struct reg_or_immed *val,
+      unsigned spill_slot, struct ir3_instruction *instr,
+      struct ir3_block *block)
+{
+   struct ir3_register *reg;
+
+   /* If spilling an immed/const pcopy src, we need to actually materialize it
+    * first with a mov.
+    */
+   if (val->flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
+      reg = materialize_pcopy_src(val, instr, block);
+   } else {
+      reg = val->def;
+   }
+
+   d("spilling ssa_%u:%u to %u", reg->instr->serialno, reg->name,
+     spill_slot);
+
+   unsigned elems = reg_elems(reg);
+   struct ir3_instruction *spill =
+      ir3_instr_create(block, OPC_SPILL_MACRO, 0, 3);
+   ir3_src_create(spill, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
+   unsigned src_flags = reg->flags & (IR3_REG_HALF | IR3_REG_IMMED |
+                                      IR3_REG_CONST | IR3_REG_SSA |
+                                      IR3_REG_ARRAY);
+   struct ir3_register *src = ir3_src_create(spill, INVALID_REG, src_flags);
+   ir3_src_create(spill, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
+   spill->cat6.dst_offset = spill_slot;
+   spill->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+
+   src->def = reg;
+   if (reg->flags & IR3_REG_ARRAY) {
+      src->size = reg->size;
+      src->array.id = reg->array.id;
+      src->array.offset = 0;
+   } else {
+      src->wrmask = reg->wrmask;
+   }
+
+   if (instr)
+      ir3_instr_move_before(spill, instr);
+}
+
+static void
+spill_interval(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
+               struct ir3_instruction *instr, struct ir3_block *block)
+{
+   spill(ctx, &interval->dst, get_spill_slot(ctx, interval->interval.reg),
+         instr, block);
+}
+
+/* This is similar to "limit" in the paper. */
+static void
+limit(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
+{
+   if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
+      d("cur half pressure %u exceeds %u", ctx->cur_pressure.half,
+        ctx->limit_pressure.half);
+      rb_tree_foreach_safe (struct ra_spill_interval, interval,
+                            &ctx->half_live_intervals, half_node) {
+         d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
+           interval->interval.reg->name);
+         if (!interval->cant_spill) {
+            if (!interval->already_spilled)
+               spill_interval(ctx, interval, instr, instr->block);
+            ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
+            if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
+               break;
+         }
+      }
+
+      assert(ctx->cur_pressure.half <= ctx->limit_pressure.half);
+   }
+
+   if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
+      d("cur full pressure %u exceeds %u", ctx->cur_pressure.full,
+        ctx->limit_pressure.full);
+      rb_tree_foreach_safe (struct ra_spill_interval, interval,
+                            &ctx->full_live_intervals, node) {
+         d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
+           interval->interval.reg->name);
+         if (!interval->cant_spill) {
+            if (!interval->already_spilled)
+               spill_interval(ctx, interval, instr, instr->block);
+            ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
+            if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
+               break;
+         } else {
+            d("can't spill");
+         }
+      }
+
+      assert(ctx->cur_pressure.full <= ctx->limit_pressure.full);
+   }
+}
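limit() is the eviction half of the Belady-style heuristic from the paper
(Braun/Hack): when pressure exceeds the limit, evict the candidate whose next
use is furthest away, and skip the actual store for values already spilled
(already_spilled). The walk takes the trees' entries in key order, so for the
loops above to evict furthest-first, the comparator (defined earlier in the
file, outside this hunk) has to sort larger next_use_distance first -- a
plausible shape, illustrative only:

   /* Hypothetical sketch of the ordering the live-interval trees need:
    * decreasing next-use distance, so the head of the tree is the cheapest
    * value to evict. Tie-break is arbitrary but must be a total order. */
   static int
   interval_cmp_sketch(const struct ra_spill_interval *a,
                       const struct ra_spill_interval *b)
   {
      if (a->next_use_distance != b->next_use_distance)
         return (a->next_use_distance > b->next_use_distance) ? -1 : 1;
      return (a < b) ? -1 : 1;
   }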
+/* There's a corner case where we reload a value which has overlapping live
+ * values already reloaded, either because it's the child of some other
+ * interval that was already reloaded or because some of its children have
+ * already been reloaded. Because RA only expects overlapping source/dest
+ * intervals for meta instructions (split/collect), and we don't want to add
+ * register pressure by creating an entirely separate value, we need to add
+ * splits and collects to deal with this case. These splits/collects also
+ * have to have correct merge-set information, so that they don't result in
+ * any actual code or register pressure in practice.
+ */
+
+static void
+add_to_merge_set(struct ir3_merge_set *set, struct ir3_register *def,
+                 unsigned offset)
+{
+   def->merge_set = set;
+   def->merge_set_offset = offset;
+   def->interval_start = set->interval_start + offset;
+   def->interval_end = set->interval_start + offset + reg_size(def);
+}
+
+static struct ir3_register *
+split(struct ir3_register *def, unsigned offset,
+      struct ir3_instruction *after, struct ir3_block *block)
+{
+   if (reg_elems(def) == 1) {
+      assert(offset == 0);
+      return def;
+   }
+
+   assert(!(def->flags & IR3_REG_ARRAY));
+   assert(def->merge_set);
+   struct ir3_instruction *split =
+      ir3_instr_create(after->block, OPC_META_SPLIT, 1, 1);
+   struct ir3_register *dst = __ssa_dst(split);
+   dst->flags |= def->flags & IR3_REG_HALF;
+   struct ir3_register *src = ir3_src_create(split, INVALID_REG, def->flags);
+   src->wrmask = def->wrmask;
+   src->def = def;
+   add_to_merge_set(def->merge_set, dst,
+                    def->merge_set_offset + offset * reg_elem_size(def));
+   if (after)
+      ir3_instr_move_before(split, after);
+   return dst;
+}
+
+static struct ir3_register *
+extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
+        struct ir3_instruction *after, struct ir3_block *block)
+{
+   if (offset == 0 && elems == reg_elems(parent_def))
+      return parent_def;
+
+   struct ir3_instruction *collect =
+      ir3_instr_create(after->block, OPC_META_COLLECT, 1, elems);
+   struct ir3_register *dst = __ssa_dst(collect);
+   dst->flags |= parent_def->flags & IR3_REG_HALF;
+   dst->wrmask = MASK(elems);
+   add_to_merge_set(parent_def->merge_set, dst, parent_def->merge_set_offset);
+
+   for (unsigned i = 0; i < elems; i++) {
+      ir3_src_create(collect, INVALID_REG, parent_def->flags)->def =
+         split(parent_def, offset + i, after, block);
+   }
+
+   if (after)
+      ir3_instr_move_before(collect, after);
+   return dst;
+}
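For instance, extracting a two-element slice starting at element 1 of an
already-live vec4 ssa_0 generates (illustrative textual IR, not real ir3
disassembly):

   ssa_1 = split ssa_0, off=1     ; element 1
   ssa_2 = split ssa_0, off=2     ; element 2
   ssa_3 = collect ssa_1, ssa_2   ; the vec2 slice

Because add_to_merge_set() places every new def at the parent's merge-set
offset, RA should coalesce all of these back onto the parent's registers, so
no moves and no extra pressure are generated in practice.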
+static struct ir3_register *
+reload(struct ra_spill_ctx *ctx, struct ir3_register *reg,
+       struct ir3_instruction *after, struct ir3_block *block)
+{
+   unsigned spill_slot = get_spill_slot(ctx, reg);
+
+   d("reloading ssa_%u:%u from %u", reg->instr->serialno, reg->name,
+     spill_slot);
+
+   unsigned elems = reg_elems(reg);
+   struct ir3_instruction *reload =
+      ir3_instr_create(block, OPC_RELOAD_MACRO, 1, 3);
+   struct ir3_register *dst = __ssa_dst(reload);
+   dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
+   ir3_src_create(reload, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
+   struct ir3_register *offset_reg =
+      ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED);
+   offset_reg->uim_val = spill_slot;
+   ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
+   reload->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+
+   if (reg->flags & IR3_REG_ARRAY) {
+      dst->array.offset = 0;
+      dst->array.id = reg->array.id;
+      dst->size = reg->size;
+   } else {
+      dst->wrmask = MASK(elems);
+   }
+
+   dst->merge_set = reg->merge_set;
+   dst->merge_set_offset = reg->merge_set_offset;
+   dst->interval_start = reg->interval_start;
+   dst->interval_end = reg->interval_end;
+
+   if (after)
+      ir3_instr_move_before(reload, after);
+
+   return dst;
+}
+
+static void
+rewrite_src_interval(struct ra_spill_ctx *ctx,
+                     struct ra_spill_interval *interval,
+                     struct ir3_register *def,
+                     struct ir3_instruction *instr,
+                     struct ir3_block *block)
+{
+   interval->dst.flags = def->flags;
+   interval->dst.def = def;
+   interval->needs_reload = false;
+
+   rb_tree_foreach (struct ra_spill_interval, child,
+                    &interval->interval.children, interval.node) {
+      struct ir3_register *child_reg = child->interval.reg;
+      struct ir3_register *child_def =
+         extract(def, (child_reg->interval_start -
+                       interval->interval.reg->interval_start) / reg_elem_size(def),
+                 reg_elems(child_reg), instr, block);
+      rewrite_src_interval(ctx, child, child_def, instr, block);
+   }
+}
+
+static void
+reload_def(struct ra_spill_ctx *ctx, struct ir3_register *def,
+           struct ir3_instruction *instr, struct ir3_block *block)
+{
+   unsigned elems = reg_elems(def);
+   struct ra_spill_interval *interval = ctx->intervals[def->name];
+
+   struct ir3_reg_interval *ir3_parent = interval->interval.parent;
+
+   if (ir3_parent) {
+      struct ra_spill_interval *parent =
+         ir3_reg_interval_to_interval(ir3_parent);
+      if (!parent->needs_reload) {
+         interval->dst.flags = def->flags;
+         interval->dst.def = extract(
+            parent->dst.def, (def->interval_start - parent->dst.def->interval_start) /
+            reg_elem_size(def), elems, instr, block);
+         return;
+      }
+   }
+
+   struct ir3_register *dst = reload(ctx, def, instr, block);
+
+   rewrite_src_interval(ctx, interval, dst, instr, block);
+}
+
+static void
+reload_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
+           struct ir3_register *src)
+{
+   struct ra_spill_interval *interval = ctx->intervals[src->def->name];
+
+   if (interval->needs_reload) {
+      reload_def(ctx, src->def, instr, instr->block);
+   }
+
+   ra_spill_interval_root(interval)->cant_spill = false;
+}
+
+static void
+rewrite_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
+            struct ir3_register *src)
+{
+   struct ra_spill_interval *interval = ctx->intervals[src->def->name];
+
+   set_src_val(src, &interval->dst);
+}
+
 static void
 update_max_pressure(struct ra_spill_ctx *ctx)
 {
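Note that spill() and reload() only emit OPC_SPILL_MACRO/OPC_RELOAD_MACRO
pseudo-instructions addressed off base_reg; presumably the new
ir3_lower_spill.c (added to meson.build at the end of this patch) rewrites
them into real private-memory accesses once sizes and offsets are final,
something on the order of:

   spill:   stp.u32  p[base + dst_offset], src, elems
   reload:  ldp.u32  dst, p[base + offset], elems

(illustrative syntax only -- the actual lowering isn't in this hunk). The
store-like shape is consistent with the ir3_validate.c change below, which
validates OPC_SPILL_MACRO alongside OPC_STL/OPC_STP/OPC_STLW.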
@@ -240,12 +947,15 @@ update_max_pressure(struct ra_spill_ctx *ctx)
 static void
 handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
 {
-   di(instr, "processing");
-
    ra_foreach_dst (dst, instr) {
       init_dst(ctx, dst);
    }
 
+   if (ctx->spilling) {
+      ra_foreach_src (src, instr)
+         insert_src(ctx, src);
+   }
+
    /* Handle tied destinations. If a destination is tied to a source and that
     * source is live-through, then we need to allocate a new register for the
    * destination which is live-through itself and cannot overlap the
@@ -258,7 +968,17 @@ handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
       insert_dst(ctx, dst);
    }
 
-   update_max_pressure(ctx);
+   if (ctx->spilling)
+      limit(ctx, instr);
+   else
+      update_max_pressure(ctx);
+
+   if (ctx->spilling) {
+      ra_foreach_src (src, instr) {
+         reload_src(ctx, instr, src);
+         update_src_next_use(ctx, src);
+      }
+   }
 
    ra_foreach_src (src, instr) {
       if (src->flags & IR3_REG_FIRST_KILL)
@@ -269,13 +989,29 @@ handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
       insert_dst(ctx, dst);
    }
 
-   update_max_pressure(ctx);
+   if (ctx->spilling)
+      limit(ctx, instr);
+   else
+      update_max_pressure(ctx);
 
-   for (unsigned i = 0; i < instr->srcs_count; i++) {
-      if (ra_reg_is_src(instr->srcs[i]) &&
-          (instr->srcs[i]->flags & IR3_REG_FIRST_KILL))
-         remove_src(ctx, instr, instr->srcs[i]);
+   /* We have to remove sources before rewriting them so that we can look up
+    * the interval to remove before the source itself is changed.
+    */
+   ra_foreach_src (src, instr) {
+      if (src->flags & IR3_REG_FIRST_KILL)
+         remove_src(ctx, instr, src);
    }
+
+   if (ctx->spilling) {
+      ra_foreach_src (src, instr) {
+         rewrite_src(ctx, instr, src);
+      }
+   }
+
+   ra_foreach_dst (dst, instr) {
+      finish_dst(ctx, dst);
+   }
+
    for (unsigned i = 0; i < instr->dsts_count; i++) {
       if (ra_reg_is_dst(instr->dsts[i]) &&
          (instr->dsts[i]->flags & IR3_REG_UNUSED))
@@ -283,28 +1019,672 @@
          remove_dst(ctx, instr->dsts[i]);
    }
 }
 
+static struct ra_spill_interval *
+create_temp_interval(struct ra_spill_ctx *ctx, struct ir3_register *def)
+{
+   unsigned name = ctx->intervals_count++;
+   unsigned offset = ctx->live->interval_offset;
+
+   /* This is kinda hacky, but we need to create a fake SSA def here that is
+    * only used as part of the pcopy accounting. See below.
+    */
+   struct ir3_register *reg = rzalloc(ctx, struct ir3_register);
+   *reg = *def;
+   reg->name = name;
+   reg->interval_start = offset;
+   reg->interval_end = offset + reg_size(def);
+   reg->merge_set = NULL;
+
+   ctx->intervals = reralloc(ctx, ctx->intervals, struct ra_spill_interval *,
+                             ctx->intervals_count);
+   struct ra_spill_interval *interval = rzalloc(ctx, struct ra_spill_interval);
+   ra_spill_interval_init(interval, reg);
+   ctx->intervals[name] = interval;
+   ctx->live->interval_offset += reg_size(def);
+   return interval;
+}
+
+/* In the sequence of copies generated (see below), would this source be
+ * killed?
+ */
+static bool
+is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n)
+{
+   struct ir3_register *src = pcopy->srcs[src_n];
+   if (!(src->flags & IR3_REG_KILL))
+      return false;
+   for (unsigned j = src_n + 1; j < pcopy->srcs_count; j++) {
+      if (pcopy->srcs[j]->def == src->def)
+         return false;
+   }
+   return true;
+}
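An outline of the per-instruction walk that handle_instr() performs in
spilling mode, paraphrased from the code above:

   /* 1. insert_src(each src)    pin sources; spilled ones marked needs_reload
    * 2. insert_dst(tied dsts)   a live-through tied source needs fresh space
    * 3. limit()                 evict values until the sources fit
    * 4. reload_src() + update_src_next_use() on every source
    * 5. remove_src_early(FIRST_KILL) so dying sources free room for dsts
    * 6. insert_dst(each dst), then limit() again so the dsts fit
    * 7. remove_src(FIRST_KILL), rewrite_src(), finish_dst(),
    *    remove_dst(UNUSED)
    */

In non-spilling mode steps 3 and 6 degrade to update_max_pressure(), which is
how the same walk doubles as ir3_calc_pressure() at the end of the file.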
+/* Parallel copies are different from normal instructions. The sources
+ * together may be larger than the entire register file, so we cannot just
+ * reload every source like normal, and indeed that probably wouldn't be a
+ * great idea. Instead we essentially need to lower the parallel copy to
+ * "copies," just like in the normal CSSA construction, although we implement
+ * the copies by reloading and then possibly spilling values. We essentially
+ * just shuffle around the sources until each source either (a) is live or
+ * (b) has the same spill slot as its corresponding destination. We do this
+ * by decomposing the copy into a series of copies, so:
+ *
+ * a, b, c = d, e, f
+ *
+ * becomes:
+ *
+ * d' = d
+ * e' = e
+ * f' = f
+ * a = d'
+ * b = e'
+ * c = f'
+ *
+ * The temporary SSA values d', e', and f' never actually show up in the
+ * result. They are only used for our internal accounting. They may, however,
+ * have their own spill slot created for them. Similarly, we don't actually
+ * emit any copy instructions, although we emit the spills/reloads that
+ * *would've* been required if those copies were there.
+ *
+ * TODO: in order to reduce the number of temporaries and therefore spill
+ * slots, we could instead do a more complicated analysis that considers the
+ * location transfer graph.
+ *
+ * In addition, we actually remove the parallel copy and rewrite all its uses
+ * (in the phi nodes) rather than rewrite its sources at the end. Recreating
+ * it later turns out to be easier than keeping it up-to-date throughout this
+ * pass, since we may have to remove entries for phi sources that are spilled
+ * and add entries for live-outs that are spilled and reloaded, which can
+ * happen here and then possibly be undone or done again when processing
+ * live-ins of the successor block.
+ */
+
+static void
+handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
+{
+   foreach_dst (dst, pcopy) {
+      struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
+      ra_spill_interval_init(dst_interval, dst);
+   }
+
+   foreach_src_n (src, i, pcopy) {
+      d("processing src %u", i);
+      struct ir3_register *dst = pcopy->dsts[i];
+
+      /* Skip the intermediate copy for cases where the source is merged with
+       * the destination. Crucially this means that we also don't reload/spill
+       * it if it's been spilled, because it shares the same spill slot.
+       */
+      if (src->def && src->def->merge_set &&
+          src->def->merge_set == dst->merge_set &&
+          src->def->merge_set_offset == dst->merge_set_offset) {
+         struct ra_spill_interval *src_interval = ctx->intervals[src->def->name];
+         struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
+         if (src_interval->interval.inserted) {
+            update_src_next_use(ctx, src);
+            if (is_last_pcopy_src(pcopy, i))
+               ra_spill_ctx_remove(ctx, src_interval);
+            dst_interval->cant_spill = true;
+            ra_spill_ctx_insert(ctx, dst_interval);
+            limit(ctx, pcopy);
+            dst_interval->cant_spill = false;
+            dst_interval->dst = src_interval->dst;
+         }
+      } else if (src->def) {
+         struct ra_spill_interval *temp_interval =
+            create_temp_interval(ctx, dst);
+         struct ir3_register *temp = temp_interval->interval.reg;
+         temp_interval->next_use_distance = src->next_use;
+
+         insert_src(ctx, src);
+         limit(ctx, pcopy);
+         reload_src(ctx, pcopy, src);
+         update_src_next_use(ctx, src);
+         if (is_last_pcopy_src(pcopy, i))
+            remove_src(ctx, pcopy, src);
+         struct ra_spill_interval *src_interval =
+            ctx->intervals[src->def->name];
+         temp_interval->dst = src_interval->dst;
+
+         temp_interval->cant_spill = true;
+         ra_spill_ctx_insert(ctx, temp_interval);
+         limit(ctx, pcopy);
+         temp_interval->cant_spill = false;
+
+         src->flags = temp->flags;
+         src->def = temp;
+      }
+   }
+
+   d("done with pcopy srcs");
+
+   foreach_src_n (src, i, pcopy) {
+      struct ir3_register *dst = pcopy->dsts[i];
+
+      if (src->def && src->def->merge_set &&
+          src->def->merge_set == dst->merge_set &&
+          src->def->merge_set_offset == dst->merge_set_offset)
+         continue;
+
+      struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
+
+      if (!src->def) {
+         dst_interval->cant_spill = true;
+         ra_spill_ctx_insert(ctx, dst_interval);
+         limit(ctx, pcopy);
+         dst_interval->cant_spill = false;
+
+         assert(src->flags & (IR3_REG_CONST | IR3_REG_IMMED));
+         if (src->flags & IR3_REG_CONST) {
+            dst_interval->dst.flags = src->flags;
+            dst_interval->dst.const_num = src->num;
+         } else {
+            dst_interval->dst.flags = src->flags;
+            dst_interval->dst.uimm = src->uim_val;
+         }
+      } else {
+         struct ra_spill_interval *temp_interval = ctx->intervals[src->def->name];
+
+         insert_src(ctx, src);
+         limit(ctx, pcopy);
+         reload_src(ctx, pcopy, src);
+         remove_src(ctx, pcopy, src);
+
+         dst_interval->dst = temp_interval->dst;
+         ra_spill_ctx_insert(ctx, dst_interval);
+      }
+   }
+
+   pcopy->flags |= IR3_INSTR_UNUSED;
+}
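Concretely, for a copy like "x, y = b, a" where neither source is merged with
its destination, the first loop above decomposes it, in accounting terms
only, into

   t0 = b
   t1 = a
   x  = t0
   y  = t1

with limit() run between every step so a just-reloaded source can be evicted
again immediately if it doesn't fit. And for "x, y = v, v" where v dies at
the pcopy, is_last_pcopy_src() fires only for the second source, so v's
interval is removed from the live set exactly once.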
 static void
 handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
 {
    init_dst(ctx, instr->dsts[0]);
    insert_dst(ctx, instr->dsts[0]);
+   finish_dst(ctx, instr->dsts[0]);
 }
 
 static void
 remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
 {
-   ra_foreach_src (src, instr)
-      remove_src(ctx, instr, src);
+   if (instr->opc == OPC_META_TEX_PREFETCH) {
+      ra_foreach_src (src, instr)
+         remove_src(ctx, instr, src);
+   }
    if (instr->dsts[0]->flags & IR3_REG_UNUSED)
       remove_dst(ctx, instr->dsts[0]);
 }
 
 static void
-handle_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def)
+handle_live_in(struct ra_spill_ctx *ctx, struct ir3_block *block,
+               struct ir3_register *def)
 {
-   struct ra_spill_interval *interval = &ctx->intervals[def->name];
+   struct ra_spill_interval *interval = ctx->intervals[def->name];
 
    ra_spill_interval_init(interval, def);
-   insert_dst(ctx, def);
+   if (ctx->spilling) {
+      interval->next_use_distance =
+         ctx->blocks[block->index].next_use_start[def->name];
+   }
+
+   ra_spill_ctx_insert(ctx, interval);
+}
+
+static bool
+is_live_in_phi(struct ir3_register *def, struct ir3_block *block)
+{
+   return def->instr->opc == OPC_META_PHI && def->instr->block == block;
+}
+
+static bool
+is_live_in_pred(struct ra_spill_ctx *ctx, struct ir3_register *def,
+                struct ir3_block *block, unsigned pred_idx)
+{
+   struct ir3_block *pred = block->predecessors[pred_idx];
+   struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+   if (is_live_in_phi(def, block)) {
+      def = def->instr->srcs[pred_idx]->def;
+      if (!def)
+         return false;
+   }
+
+   return _mesa_hash_table_search(state->remap, def);
+}
+
+static bool
+is_live_in_undef(struct ir3_register *def,
+                 struct ir3_block *block, unsigned pred_idx)
+{
+   if (!is_live_in_phi(def, block))
+      return false;
+
+   return !def->instr->srcs[pred_idx]->def;
+}
+
+static struct reg_or_immed *
+read_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
+             struct ir3_block *block, unsigned pred_idx)
+{
+   struct ir3_block *pred = block->predecessors[pred_idx];
+   struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+
+   if (is_live_in_phi(def, block)) {
+      def = def->instr->srcs[pred_idx]->def;
+      if (!def)
+         return NULL;
+   }
+
+   struct hash_entry *entry = _mesa_hash_table_search(state->remap, def);
+   if (entry)
+      return entry->data;
+   else
+      return NULL;
+}
+
+static bool
+is_live_in_all_preds(struct ra_spill_ctx *ctx, struct ir3_register *def,
+                     struct ir3_block *block)
+{
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      if (!is_live_in_pred(ctx, def, block, i))
+         return false;
+   }
+
+   return true;
+}
+
+static void
+spill_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
+              struct ir3_block *block)
+{
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+
+      if (!state->visited)
+         continue;
+
+      struct reg_or_immed *pred_def = read_live_in(ctx, def, block, i);
+      if (pred_def) {
+         spill(ctx, pred_def, get_spill_slot(ctx, def), NULL, pred);
+      }
+   }
+}
+
+static void
+spill_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   bool all_preds_visited = true;
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+      if (!state->visited) {
+         all_preds_visited = false;
+         break;
+      }
+   }
+
+   /* Note: in the paper they explicitly spill live-through values first, but
+    * we should be doing that automatically by virtue of picking the largest
+    * distance due to the extra distance added to edges out of loops.
+    *
+    * TODO: Keep track of pressure in each block and preemptively spill
+    * live-through values as described in the paper to avoid spilling them
+    * inside the loop.
+    */
+
+   if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
+      rb_tree_foreach_safe (struct ra_spill_interval, interval,
+                            &ctx->half_live_intervals, half_node) {
+         if (all_preds_visited &&
+             is_live_in_all_preds(ctx, interval->interval.reg, block))
+            continue;
+         spill_live_in(ctx, interval->interval.reg, block);
+         ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
+         if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
+            break;
+      }
+   }
+
+   if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
+      rb_tree_foreach_safe (struct ra_spill_interval, interval,
+                            &ctx->full_live_intervals, node) {
+         if (all_preds_visited &&
+             is_live_in_all_preds(ctx, interval->interval.reg, block))
+            continue;
+         spill_live_in(ctx, interval->interval.reg, block);
+         ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
+         if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
+            break;
+      }
+   }
+}
+
+static void
+live_in_rewrite(struct ra_spill_ctx *ctx,
+                struct ra_spill_interval *interval,
+                struct reg_or_immed *new_val,
+                struct ir3_block *block, unsigned pred_idx)
+{
+   struct ir3_block *pred = block->predecessors[pred_idx];
+   struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+   struct ir3_register *def = interval->interval.reg;
+   if (is_live_in_phi(def, block)) {
+      def = def->instr->srcs[pred_idx]->def;
+   }
+
+   if (def)
+      _mesa_hash_table_insert(state->remap, def, new_val);
+
+   rb_tree_foreach (struct ra_spill_interval, child,
+                    &interval->interval.children, interval.node) {
+      assert(new_val->flags & IR3_REG_SSA);
+      struct ir3_register *child_def =
+         extract(new_val->def,
+                 (child->interval.reg->interval_start - def->interval_start) /
+                 reg_elem_size(def), reg_elems(child->interval.reg),
+                 NULL, pred);
+      struct reg_or_immed *child_val = ralloc(ctx, struct reg_or_immed);
+      child_val->def = child_def;
+      child_val->flags = child_def->flags;
+      live_in_rewrite(ctx, child, child_val, block, pred_idx);
+   }
+}
+
+static void
+reload_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
+               struct ir3_block *block)
+{
+   struct ra_spill_interval *interval = ctx->intervals[def->name];
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+      if (!state->visited)
+         continue;
+
+      if (is_live_in_undef(def, block, i))
+         continue;
+
+      struct reg_or_immed *new_val = read_live_in(ctx, def, block, i);
+
+      if (!new_val) {
+         new_val = ralloc(ctx, struct reg_or_immed);
+         new_val->def = reload(ctx, def, NULL, pred);
+         new_val->flags = new_val->def->flags;
+      }
+      live_in_rewrite(ctx, interval, new_val, block, i);
+   }
+}
+
+static void
+reload_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
+                    interval.node) {
+      reload_live_in(ctx, interval->interval.reg, block);
+   }
+}
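The glue between blocks is each visited predecessor's state->remap hash
table, which maps that block's defs to wherever the value lives at the end of
the block: a register, a const, an immediate, or nothing at all, meaning the
value only exists in the spill area. The helpers above all reduce to this
lookup (illustrative wrapper, not part of the patch):

   /* NULL means the value is only in scratch memory on this edge, and
    * reload_live_in() must emit a reload in the predecessor. */
   static struct reg_or_immed *
   lookup_live_in(struct hash_table *pred_remap, struct ir3_register *def)
   {
      struct hash_entry *entry = _mesa_hash_table_search(pred_remap, def);
      return entry ? (struct reg_or_immed *)entry->data : NULL;
   }

For a phi live-in the def is first translated to the matching phi source of
that predecessor before the lookup, which is what is_live_in_phi() is for.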
+static void
+add_live_in_phi(struct ra_spill_ctx *ctx, struct ir3_register *def,
+                struct ir3_block *block)
+{
+   struct ra_spill_interval *interval = ctx->intervals[def->name];
+   if (!interval->interval.inserted)
+      return;
+
+   bool needs_phi = false;
+   struct ir3_register *cur_def = NULL;
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+
+      if (!state->visited) {
+         needs_phi = true;
+         break;
+      }
+
+      struct hash_entry *entry =
+         _mesa_hash_table_search(state->remap, def);
+      assert(entry);
+      struct reg_or_immed *pred_val = entry->data;
+      if ((pred_val->flags & (IR3_REG_IMMED | IR3_REG_CONST)) ||
+          !pred_val->def ||
+          (cur_def && cur_def != pred_val->def)) {
+         needs_phi = true;
+         break;
+      }
+      cur_def = pred_val->def;
+   }
+
+   if (!needs_phi) {
+      interval->dst.def = cur_def;
+      interval->dst.flags = cur_def->flags;
+      return;
+   }
+
+   struct ir3_instruction *phi =
+      ir3_instr_create(block, OPC_META_PHI, 1, block->predecessors_count);
+   struct ir3_register *dst = __ssa_dst(phi);
+   dst->flags |= def->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
+   dst->size = def->size;
+   dst->wrmask = def->wrmask;
+
+   dst->interval_start = def->interval_start;
+   dst->interval_end = def->interval_end;
+   dst->merge_set = def->merge_set;
+   dst->merge_set_offset = def->merge_set_offset;
+
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+      struct ir3_register *src = ir3_src_create(phi, INVALID_REG, dst->flags);
+      src->size = def->size;
+      src->wrmask = def->wrmask;
+
+      if (state->visited) {
+         struct hash_entry *entry =
+            _mesa_hash_table_search(state->remap, def);
+         assert(entry);
+         struct reg_or_immed *new_val = entry->data;
+         set_src_val(src, new_val);
+      } else {
+         src->def = def;
+      }
+   }
+
+   interval->dst.def = dst;
+   interval->dst.flags = dst->flags;
+
+   ir3_instr_move_before_block(phi, block);
+}
+
+/* When spilling a block with a single predecessor, the pred may have other
+ * successors so we can't choose what's live in and we can't spill/restore
+ * anything. Just make the inserted intervals exactly match the predecessor.
+ * If it wasn't live in the predecessor then it must've already been spilled.
+ * Also, there are no phi nodes and no live-ins.
+ */
+static void
+spill_single_pred_live_in(struct ra_spill_ctx *ctx,
+                          struct ir3_block *block)
+{
+   unsigned name;
+   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
+                       ctx->live->definitions_count) {
+      struct ir3_register *reg = ctx->live->definitions[name];
+      struct ra_spill_interval *interval = ctx->intervals[reg->name];
+      struct reg_or_immed *val = read_live_in(ctx, reg, block, 0);
+      if (val)
+         interval->dst = *val;
+      else
+         ra_spill_ctx_remove(ctx, interval);
+   }
+}
+
+static void
+rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi,
+            struct ir3_block *block)
+{
+   if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) {
+      phi->flags |= IR3_INSTR_UNUSED;
+      return;
+   }
+
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+
+      if (!state->visited)
+         continue;
+
+      struct ir3_register *src = phi->srcs[i];
+      if (!src->def)
+         continue;
+
+      struct hash_entry *entry =
+         _mesa_hash_table_search(state->remap, src->def);
+      assert(entry);
+      struct reg_or_immed *new_val = entry->data;
+      set_src_val(src, new_val);
+   }
+}
+
+static void
+spill_live_out(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
+               struct ir3_block *block)
+{
+   struct ir3_register *def = interval->interval.reg;
+
+   spill(ctx, &interval->dst, get_spill_slot(ctx, def), NULL, block);
+   ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
+}
+
+static void
+spill_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   struct ra_spill_block_state *state = &ctx->blocks[block->index];
+   rb_tree_foreach_safe (struct ra_spill_interval, interval,
+                         &ctx->reg_ctx.intervals, interval.node) {
+      if (!BITSET_TEST(state->live_out, interval->interval.reg->name)) {
+         spill_live_out(ctx, interval, block);
+      }
+   }
+}
+
+static void
+reload_live_out(struct ra_spill_ctx *ctx, struct ir3_register *def,
+                struct ir3_block *block)
+{
+   struct ra_spill_interval *interval = ctx->intervals[def->name];
+   ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
+
+   reload_def(ctx, def, NULL, block);
+}
+
+static void
+reload_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   struct ra_spill_block_state *state = &ctx->blocks[block->index];
+   unsigned name;
+   BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
+      struct ir3_register *reg = ctx->live->definitions[name];
+      struct ra_spill_interval *interval = ctx->intervals[name];
+      if (!interval->interval.inserted)
+         reload_live_out(ctx, reg, block);
+   }
+}
+
+static void
+update_live_out_phis(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   assert(!block->successors[1]);
+   struct ir3_block *succ = block->successors[0];
+   unsigned pred_idx = ir3_block_get_pred_index(succ, block);
+
+   foreach_instr (instr, &succ->instr_list) {
+      if (instr->opc != OPC_META_PHI)
+         break;
+
+      struct ir3_register *def = instr->srcs[pred_idx]->def;
+      if (!def)
+         continue;
+
+      struct ra_spill_interval *interval = ctx->intervals[def->name];
+      if (!interval->interval.inserted)
+         continue;
+      set_src_val(instr->srcs[pred_idx], &interval->dst);
+   }
+}
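These three live-out helpers exist for the loop-backedge case in
handle_block() further down: when a block's successor has already been
visited, the successor's register state is fixed, so the current block has to
conform to it rather than the other way around -- spill whatever the
successor doesn't expect to be live, reload whatever it does, and point the
successor's phi sources at the values' current homes. This mirrors the
live-in stitching above, just with the direction of authority reversed.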
+static void
+record_pred_live_out(struct ra_spill_ctx *ctx,
+                     struct ra_spill_interval *interval,
+                     struct ir3_block *block, unsigned pred_idx)
+{
+   struct ir3_block *pred = block->predecessors[pred_idx];
+   struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+
+   struct ir3_register *def = interval->interval.reg;
+   if (is_live_in_phi(def, block)) {
+      def = def->instr->srcs[pred_idx]->def;
+   }
+   BITSET_SET(state->live_out, def->name);
+
+   rb_tree_foreach (struct ra_spill_interval, child,
+                    &interval->interval.children, interval.node) {
+      record_pred_live_out(ctx, child, block, pred_idx);
+   }
+}
+
+static void
+record_pred_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+      if (state->visited)
+         continue;
+
+      state->live_out = rzalloc_array(ctx, BITSET_WORD,
+                                      BITSET_WORDS(ctx->live->definitions_count));
+
+      rb_tree_foreach (struct ra_spill_interval, interval,
+                       &ctx->reg_ctx.intervals, interval.node) {
+         record_pred_live_out(ctx, interval, block, i);
+      }
+   }
+}
+
+static void
+record_live_out(struct ra_spill_ctx *ctx,
+                struct ra_spill_block_state *state,
+                struct ra_spill_interval *interval)
+{
+   if (!(interval->dst.flags & IR3_REG_SSA) ||
+       interval->dst.def) {
+      struct reg_or_immed *val = ralloc(ctx, struct reg_or_immed);
+      *val = interval->dst;
+      _mesa_hash_table_insert(state->remap, interval->interval.reg, val);
+   }
+   rb_tree_foreach (struct ra_spill_interval, child,
+                    &interval->interval.children, interval.node) {
+      record_live_out(ctx, state, child);
+   }
+}
+
+static void
+record_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   struct ra_spill_block_state *state = &ctx->blocks[block->index];
+   state->remap = _mesa_pointer_hash_table_create(ctx);
+
+   rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
+                    interval.node) {
+      record_live_out(ctx, state, interval);
+   }
+}
+
 static void
@@ -312,12 +1692,14 @@ handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
 {
    memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
    rb_tree_init(&ctx->reg_ctx.intervals);
+   rb_tree_init(&ctx->full_live_intervals);
+   rb_tree_init(&ctx->half_live_intervals);
 
    unsigned name;
    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
                        ctx->live->definitions_count) {
       struct ir3_register *reg = ctx->live->definitions[name];
-      handle_live_in(ctx, reg);
+      handle_live_in(ctx, block, reg);
    }
 
    foreach_instr (instr, &block->instr_list) {
@@ -327,36 +1709,297 @@ handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
       handle_input_phi(ctx, instr);
    }
 
-   update_max_pressure(ctx);
+   if (ctx->spilling) {
+      if (block->predecessors_count == 1) {
+         spill_single_pred_live_in(ctx, block);
+      } else {
+         spill_live_ins(ctx, block);
+         reload_live_ins(ctx, block);
+         record_pred_live_outs(ctx, block);
+         foreach_instr (instr, &block->instr_list) {
+            if (instr->opc != OPC_META_PHI)
+               break;
+            rewrite_phi(ctx, instr, block);
+         }
+         BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
+                             ctx->live->definitions_count) {
+            struct ir3_register *reg = ctx->live->definitions[name];
+            add_live_in_phi(ctx, reg, block);
+         }
+      }
+   } else {
+      update_max_pressure(ctx);
+   }
 
    foreach_instr (instr, &block->instr_list) {
+      di(instr, "processing");
+
       if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
           instr->opc == OPC_META_TEX_PREFETCH)
          remove_input_phi(ctx, instr);
+      else if (ctx->spilling && instr->opc == OPC_META_PARALLEL_COPY)
+         handle_pcopy(ctx, instr);
+      else if (ctx->spilling && instr->opc == OPC_MOV &&
+               instr->dsts[0] == ctx->base_reg)
+         /* skip */;
       else
          handle_instr(ctx, instr);
    }
+
+   if (ctx->spilling && block->successors[0]) {
+      struct ra_spill_block_state *state =
+         &ctx->blocks[block->successors[0]->index];
+      if (state->visited) {
+         assert(!block->successors[1]);
+
+         spill_live_outs(ctx, block);
+         reload_live_outs(ctx, block);
+         update_live_out_phis(ctx, block);
+      }
+   }
+
+   if (ctx->spilling) {
+      record_live_outs(ctx, block);
+      ctx->blocks[block->index].visited = true;
+   }
+}
+
+static bool
+simplify_phi_node(struct ir3_instruction *phi)
+{
+   struct ir3_register *def = NULL;
+   foreach_src (src, phi) {
+      /* Ignore phi sources which point to the phi itself. */
+      if (src->def == phi->dsts[0])
+         continue;
+      /* If it's undef or it doesn't match the previous sources, bail */
+      if (!src->def || (def && def != src->def))
+         return false;
+      def = src->def;
+   }
+
+   phi->data = def;
+   phi->flags |= IR3_INSTR_UNUSED;
+   return true;
+}
+
+static void
+simplify_phi_srcs(struct ir3_instruction *instr)
+{
+   foreach_src (src, instr) {
+      if (src->def && src->def->instr->opc == OPC_META_PHI) {
+         struct ir3_instruction *phi = src->def->instr;
+         if (phi->data)
+            src->def = phi->data;
+      }
+   }
+}
+
+/* We insert phi nodes for all live-ins of loops in case we need to split the
+ * live range. This pass cleans that up for the case where the live range
+ * didn't actually need to be split.
+ */
+static void
+simplify_phi_nodes(struct ir3 *ir)
+{
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (instr->opc != OPC_META_PHI)
+            break;
+         instr->data = NULL;
+      }
+   }
+
+   bool progress;
+   do {
+      progress = false;
+      foreach_block (block, &ir->block_list) {
+         foreach_instr (instr, &block->instr_list) {
+            if (instr->opc == OPC_META_PHI || (instr->flags & IR3_INSTR_UNUSED))
+               continue;
+
+            simplify_phi_srcs(instr);
+         }
+
+         for (unsigned i = 0; i < 2; i++) {
+            struct ir3_block *succ = block->successors[i];
+            if (!succ)
+               continue;
+            foreach_instr (instr, &succ->instr_list) {
+               if (instr->opc != OPC_META_PHI)
+                  break;
+               if (instr->flags & IR3_INSTR_UNUSED)
+                  continue;
+
+               simplify_phi_srcs(instr);
+               progress |= simplify_phi_node(instr);
+            }
+         }
+      }
+   } while (progress);
+}
+
+static void
+unmark_dead(struct ir3 *ir)
+{
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         instr->flags &= ~IR3_INSTR_UNUSED;
+      }
+   }
+}
+
+/* Simple pass to remove now-dead phi nodes and pcopy instructions. We mark
+ * which ones are dead along the way, so there's nothing to compute here.
+ */
+static void
+cleanup_dead(struct ir3 *ir)
+{
+   foreach_block (block, &ir->block_list) {
+      foreach_instr_safe (instr, &block->instr_list) {
+         if (instr->flags & IR3_INSTR_UNUSED)
+            list_delinit(&instr->node);
+      }
+   }
+}
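A typical victim of simplify_phi_nodes() is a loop live-in whose range never
actually got split, e.g. (illustrative SSA):

   header:
      x1 = phi(x0, x1)   ; the backedge source is the phi itself

simplify_phi_node() skips the self-reference, sees that every remaining
source is x0, stashes x0 in phi->data, and marks the phi IR3_INSTR_UNUSED;
simplify_phi_srcs() then redirects users of x1 back to x0. The pass iterates
to a fixed point so chains of such phis collapse as well, and cleanup_dead()
finally deletes everything marked unused.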
+/* Deal with merge sets after spilling. Spilling generally leaves the merge
+ * sets in a mess, and even if we properly cleaned up after ourselves, we
+ * would want to recompute the merge sets afterward anyway. That's because
+ * spilling/reloading can "break up" phi webs and split/collect webs so that
+ * allocating them to the same register no longer gives any benefit. For
+ * example, imagine we have this:
+ *
+ * if (...) {
+ *    foo = ...
+ * } else {
+ *    bar = ...
+ * }
+ * baz = phi(foo, bar)
+ *
+ * and we spill "baz":
+ *
+ * if (...) {
+ *    foo = ...
+ *    spill(foo)
+ * } else {
+ *    bar = ...
+ *    spill(bar)
+ * }
+ * baz = reload()
+ *
+ * now foo, bar, and baz don't have to be allocated to the same register. How
+ * exactly the merge sets change can be complicated, so it's easier just to
+ * recompute them.
+ *
+ * However, there's a wrinkle in this: those same merge sets determine the
+ * register pressure, due to multiple values inhabiting the same register!
+ * And we assume that this sharing happens when spilling. Therefore we need a
+ * three-step procedure:
+ *
+ * 1. Drop the original merge sets.
+ * 2. Calculate which values *must* be merged, being careful to only use the
+ *    interval information which isn't trashed by spilling, and forcibly
+ *    merge them.
+ * 3. Let ir3_merge_regs() finish the job, including recalculating the
+ *    intervals.
+ */
+
+static void
+fixup_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
+{
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         ra_foreach_dst (dst, instr) {
+            dst->merge_set = NULL;
+            dst->merge_set_offset = 0;
+         }
+      }
+   }
+
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (instr->opc != OPC_META_SPLIT &&
+             instr->opc != OPC_META_COLLECT)
+            continue;
+
+         struct ir3_register *dst = instr->dsts[0];
+         ra_foreach_src (src, instr) {
+            if (!(src->flags & IR3_REG_KILL) &&
+                src->def->interval_start < dst->interval_end &&
+                dst->interval_start < src->def->interval_end) {
+               ir3_force_merge(dst, src->def,
+                               src->def->interval_start - dst->interval_start);
+            }
+         }
+      }
+   }
+
+   ir3_merge_regs(live, ir);
+}
+
 void
 ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
                   struct ir3_pressure *max_pressure)
 {
-   struct ra_spill_ctx ctx = {};
-   ctx.live = live;
-   ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals));
-   ctx.compiler = v->shader->compiler;
-   spill_ctx_init(&ctx);
+   struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx);
+   spill_ctx_init(ctx, v, live);
 
    foreach_block (block, &v->ir->block_list) {
-      handle_block(&ctx, block);
+      handle_block(ctx, block);
    }
 
-   assert(ctx.cur_pressure.full == 0);
-   assert(ctx.cur_pressure.half == 0);
-   assert(ctx.cur_pressure.shared == 0);
+   assert(ctx->cur_pressure.full == 0);
+   assert(ctx->cur_pressure.half == 0);
+   assert(ctx->cur_pressure.shared == 0);
 
-   free(ctx.intervals);
-
-   *max_pressure = ctx.max_pressure;
+   *max_pressure = ctx->max_pressure;
+   ralloc_free(ctx);
+}
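Since the same walk serves both pressure measurement (ctx->spilling false)
and actual spilling (ctx->spilling true), a plausible caller looks like the
following sketch -- illustrative only, the real RA driver loop is not part of
this hunk, and the limits are made up:

   static void
   spill_if_needed(struct ir3 *ir, struct ir3_shader_variant *v,
                   struct ir3_liveness **live)
   {
      struct ir3_pressure max;
      /* Hypothetical limits; .shared is effectively unlimited since the
       * spiller only tracks full/half trees. */
      struct ir3_pressure limit = {
         .full = 48 * 4, .half = 48 * 4, .shared = ~0u,
      };

      ir3_calc_pressure(v, *live, &max);
      if (max.full > limit.full || max.half > limit.half)
         ir3_spill(ir, v, live, &limit);
   }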
+bool
+ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v,
+          struct ir3_liveness **live,
+          const struct ir3_pressure *limit_pressure)
+{
+   struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx);
+   spill_ctx_init(ctx, v, *live);
+
+   ctx->spilling = true;
+
+   ctx->blocks = rzalloc_array(ctx, struct ra_spill_block_state,
+                               ctx->live->block_count);
+   rb_tree_init(&ctx->full_live_intervals);
+   rb_tree_init(&ctx->half_live_intervals);
+
+   ctx->limit_pressure = *limit_pressure;
+   ctx->spill_slot = v->pvtmem_size;
+
+   add_base_reg(ctx, ir);
+   compute_next_distance(ctx, ir);
+
+   unmark_dead(ir);
+
+   foreach_block (block, &ir->block_list) {
+      handle_block(ctx, block);
+   }
+
+   simplify_phi_nodes(ir);
+
+   cleanup_dead(ir);
+
+   ir3_create_parallel_copies(ir);
+
+   /* After this point, we're done mutating the IR. Liveness has been trashed,
+    * so recalculate it. We'll need it for recalculating the merge sets.
+    */
+   ralloc_free(ctx->live);
+   *live = ir3_calc_liveness(v);
+
+   fixup_merge_sets(*live, ir);
+
+   v->pvtmem_size = ctx->spill_slot;
+   ralloc_free(ctx);
+
+   return true;
 }
diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c
index 4fe56b45c9c..08f2df4251a 100644
--- a/src/freedreno/ir3/ir3_validate.c
+++ b/src/freedreno/ir3/ir3_validate.c
@@ -187,7 +187,7 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
          /* end/chmask/etc are allowed to have different size sources */
       } else if (instr->opc == OPC_META_PARALLEL_COPY) {
          /* pcopy sources have to match with their destination but can have
-          * different size.
+          * different sizes from each other.
           */
       } else if (n > 0) {
          validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) ==
@@ -303,6 +303,7 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
    case OPC_STL:
    case OPC_STP:
    case OPC_STLW:
+   case OPC_SPILL_MACRO:
       validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
       validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
       validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build
index 40bdb26194d..0456bc59253 100644
--- a/src/freedreno/ir3/meson.build
+++ b/src/freedreno/ir3/meson.build
@@ -88,6 +88,7 @@ libfreedreno_ir3_files = files(
   'ir3_legalize.c',
   'ir3_liveness.c',
   'ir3_lower_parallelcopy.c',
+  'ir3_lower_spill.c',
   'ir3_lower_subgroups.c',
   'ir3_merge_regs.c',
   'ir3_nir.c',
diff --git a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt
index 61c1a5b0cda..d311f70ae3b 100644
--- a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt
+++ b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt
@@ -345,7 +345,6 @@ spec@glsl-1.50@execution@compatibility@vs-gs-ff-frag,Crash
 spec@glsl-1.50@execution@compatibility@vs-gs-texcoord-array-2,Crash
 spec@glsl-1.50@execution@compatibility@vs-gs-texcoord-array,Crash
 spec@glsl-1.50@execution@geometry@end-primitive 0,Fail
-spec@glsl-1.50@execution@geometry@max-input-components,Fail
 spec@glsl-1.50@execution@geometry@primitive-id-restart gl_line_loop ffs,Fail
 spec@glsl-1.50@execution@geometry@primitive-id-restart gl_line_loop other,Fail
 spec@glsl-1.50@execution@geometry@primitive-id-restart gl_lines_adjacency ffs,Fail
@@ -385,11 +384,7 @@ spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triang
 spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triangle_strip other,Fail
 spec@glsl-1.50@execution@primitive-id-no-gs-quads,Fail
 spec@glsl-1.50@execution@primitive-id-no-gs-quad-strip,Fail
-spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec2-index-rd,Fail
-spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec3-index-rd,Fail
-spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec3-index-wr,Fail
 spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec4-index-wr,Crash
-spec@glsl-1.50@execution@variable-indexing@vs-output-array-vec4-index-wr-before-gs,Fail
 spec@glsl-1.50@gs-max-output-components,Fail
 spec@intel_performance_query@intel_performance_query-issue_2235,Fail
 spec@khr_texture_compression_astc@array-gl@12x12 Block Dim,Fail