diff --git a/src/freedreno/ci/deqp-freedreno-a630-fails.txt b/src/freedreno/ci/deqp-freedreno-a630-fails.txt index 18325d68824..e5790183e98 100644 --- a/src/freedreno/ci/deqp-freedreno-a630-fails.txt +++ b/src/freedreno/ci/deqp-freedreno-a630-fails.txt @@ -14,11 +14,6 @@ KHR-GL33.transform_feedback.query_vertex_separate_test,Fail # "*** Color comparison failed" KHR-GLES3.packed_depth_stencil.verify_read_pixels.depth24_stencil8,Fail -# "MESA: error: ir3_ra() failed!" -KHR-GLES31.core.arrays_of_arrays.InteractionFunctionCalls2,Fail -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing5,Fail -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing6,Fail - # "The values of resultStd[i] & 0xFFFFFFFE and resultFma[i] & 0xFFFFFFFE and resultCPU[i] & 0xFFFFFFFE are not bitwise equal for i = 0..99 " KHR-GLES31.core.gpu_shader5.fma_precision_float,Fail KHR-GLES31.core.gpu_shader5.fma_precision_vec2,Fail @@ -86,11 +81,6 @@ dEQP-VK.api.info.get_physical_device_properties2.properties,Fail dEQP-VK.api.object_management.alloc_callback_fail.device,Fail dEQP-VK.api.object_management.alloc_callback_fail.device_group,Fail -# "MESA: error: ir3_ra() failed!" -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite,Fail -dEQP-VK.graphicsfuzz.spv-stable-pillars-volatile-nontemporal-store,Fail - # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3019 # should be fixed by https://gerrit.khronos.org/c/vk-gl-cts/+/7745 dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.7,Fail @@ -98,10 +88,6 @@ dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.7,Fail dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.7,Fail dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.7,Fail -# "MESA: error: ir3_ra() failed! -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -dEQP-VK.spirv_assembly.instruction.compute.opcopymemory.array,Fail - # "deqp-vk: ../src/freedreno/vulkan/tu_cs.h:186: tu_cs_reserve: Assertion `tu_cs_get_space(cs) >= reserved_size' failed." # https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8841 dEQP-VK.spirv_assembly.instruction.compute.opphi.wide,Crash @@ -120,14 +106,6 @@ dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_si dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail -# "MESA: error: ir3_ra() failed!" -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -# Needs spilling, or maybe some scheduling (though throwing a bit of nir_move/sink -# at it didn't help). 
-dEQP-VK.spirv_assembly.instruction.spirv1p4.opcopylogical.nested_arrays_different_inner_stride,Fail -dEQP-VK.spirv_assembly.instruction.spirv1p4.opcopylogical.nested_arrays_different_outer_stride,Fail -dEQP-VK.spirv_assembly.instruction.spirv1p4.opcopylogical.nested_arrays_different_strides,Fail - dEQP-VK.texture.filtering.2d.formats.d24_unorm_s8_uint_stencil.nearest,Fail dEQP-VK.texture.filtering.2d_array.formats.d24_unorm_s8_uint_stencil.d24_unorm_s8_uint_stencil_nearest,Fail dEQP-VK.texture.filtering.cube.formats.d24_unorm_s8_uint_stencil.nearest,Fail @@ -136,205 +114,6 @@ dEQP-VK.texture.filtering.unnormal.formats.d24_unorm_s8_uint_stencil.nearest,Fai # Broken on all drivers: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4582 dEQP-VK.wsi.display_control.register_device_event,Fail -# "MESA: error: ir3_ra() failed!" -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3,Fail 
-dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3_comp_access_store_cols,Fail 
-dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2_comp_access,Fail 
-dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3_store_cols,Fail 
-dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.random.all_shared_buffer.5,Fail -dEQP-VK.ssbo.layout.random.nested_structs_arrays.0,Fail -dEQP-VK.ssbo.layout.random.nested_structs_arrays.17,Fail -dEQP-VK.ssbo.layout.random.scalar.19,Fail - bypass-dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.7,Fail bypass-dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.7,Fail bypass-dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.7,Fail diff --git a/src/freedreno/ci/deqp-freedreno-a630-skips.txt b/src/freedreno/ci/deqp-freedreno-a630-skips.txt index b3531a2c6ac..d95dac8a996 100644 --- a/src/freedreno/ci/deqp-freedreno-a630-skips.txt +++ b/src/freedreno/ci/deqp-freedreno-a630-skips.txt @@ -25,3 +25,8 @@ dEQP-VK.ubo.random.all_shared_buffer.48 # Still running after 3 hours, time is spent in batch_draw_tracking(). KHR-GLES31.core.shader_image_load_store.basic-allFormats-store-fs + +# causes a hangcheck timeout on a630: +# msm ae00000.mdss: [drm:hangcheck_handler] *ERROR* A630: hangcheck detected gpu lockup rb 0! 
+dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite +dEQP-VK.graphicsfuzz.spv-stable-pillars-volatile-nontemporal-store diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c index 4a0fb40bbd1..ee4524797bd 100644 --- a/src/freedreno/ir3/disasm-a3xx.c +++ b/src/freedreno/ir3/disasm-a3xx.c @@ -348,6 +348,9 @@ static const struct opc_info { OPC(6, OPC_GETSPID, getspid), OPC(6, OPC_GETWID, getwid), + OPC(6, OPC_SPILL_MACRO, spill.macro), + OPC(6, OPC_RELOAD_MACRO, reload.macro), + OPC(7, OPC_BAR, bar), OPC(7, OPC_FENCE, fence), /* clang-format on */ diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h index 93162c103b4..f367d6a197d 100644 --- a/src/freedreno/ir3/instr-a3xx.h +++ b/src/freedreno/ir3/instr-a3xx.h @@ -308,6 +308,9 @@ typedef enum { OPC_LDG_A = _OPC(6, 55), OPC_STG_A = _OPC(6, 56), + OPC_SPILL_MACRO = _OPC(6, 57), + OPC_RELOAD_MACRO = _OPC(6, 58), + /* category 7: */ OPC_BAR = _OPC(7, 0), OPC_FENCE = _OPC(7, 1), diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 187f8e13d3e..e0c678b0534 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -89,6 +89,7 @@ struct ir3_merge_set { uint16_t alignment; unsigned interval_start; + unsigned spill_slot; unsigned regs_count; struct ir3_register **regs; @@ -202,6 +203,8 @@ struct ir3_register { */ struct ir3_register *tied; + unsigned spill_slot, next_use; + unsigned merge_set_offset; struct ir3_merge_set *merge_set; unsigned interval_start, interval_end; @@ -711,6 +714,17 @@ ir3_instr_move_after(struct ir3_instruction *instr, list_add(&instr->node, &before->node); } +/** + * Move 'instr' to the beginning of the block: + */ +static inline void +ir3_instr_move_before_block(struct ir3_instruction *instr, + struct ir3_block *block) +{ + list_delinit(&instr->node); + list_add(&instr->node, &block->instr_list); +} + void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps); void ir3_set_dst_type(struct ir3_instruction *instr, bool half); diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 29a59052f99..cf407e6b2b3 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -44,6 +44,7 @@ static const struct debug_named_value shader_debug_options[] = { {"nouboopt", IR3_DBG_NOUBOOPT, "Disable lowering UBO to uniform"}, {"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"}, {"nocache", IR3_DBG_NOCACHE, "Disable shader cache"}, + {"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"}, #ifdef DEBUG /* DEBUG-only options: */ {"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"}, diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index cf2fe2ad221..afe6113b1e4 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -194,6 +194,7 @@ enum ir3_shader_debug { IR3_DBG_NOUBOOPT = BITFIELD_BIT(9), IR3_DBG_NOFP16 = BITFIELD_BIT(10), IR3_DBG_NOCACHE = BITFIELD_BIT(11), + IR3_DBG_SPILLALL = BITFIELD_BIT(12), /* DEBUG-only options: */ IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20), diff --git a/src/freedreno/ir3/ir3_lower_spill.c b/src/freedreno/ir3/ir3_lower_spill.c new file mode 100644 index 00000000000..265207105e9 --- /dev/null +++ b/src/freedreno/ir3/ir3_lower_spill.c @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the 
Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" + +/* The spilling pass leaves out a few details required to successfully operate + * ldp/stp: + * + * 1. ldp/stp can only load/store 4 components at a time, but spilling ignores + * that and just spills/restores entire values, including arrays and values + * created for texture setup which can be more than 4 components. + * 2. The spiller doesn't add barrier dependencies needed for post-RA + * scheduling. + * + * The first one, in particular, is much easier to handle after RA because + * arrays and normal values can be treated the same way. Therefore this pass + * runs after RA, and handles both issues. This keeps the complexity out of the + * spiller. + */ + +static void +split_spill(struct ir3_instruction *spill) +{ + unsigned orig_components = spill->srcs[2]->uim_val; + + /* We don't handle splitting dependencies. */ + assert(spill->deps_count == 0); + + if (orig_components <= 4) { + if (spill->srcs[1]->flags & IR3_REG_ARRAY) { + spill->srcs[1]->wrmask = MASK(orig_components); + spill->srcs[1]->num = spill->srcs[1]->array.base; + spill->srcs[1]->flags &= ~IR3_REG_ARRAY; + } + return; + } + + for (unsigned comp = 0; comp < orig_components; comp += 4) { + unsigned components = MIN2(orig_components - comp, 4); + struct ir3_instruction *clone = ir3_instr_clone(spill); + ir3_instr_move_before(clone, spill); + + clone->srcs[1]->wrmask = MASK(components); + if (clone->srcs[1]->flags & IR3_REG_ARRAY) { + clone->srcs[1]->num = clone->srcs[1]->array.base + comp; + clone->srcs[1]->flags &= ~IR3_REG_ARRAY; + } + + clone->srcs[2]->uim_val = components; + clone->cat6.dst_offset += + comp * ((spill->srcs[1]->flags & IR3_REG_HALF) ? 
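/* 2 bytes per half component, 4 per full */ 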
2 : 4); + } + + list_delinit(&spill->node); +} + +static void +split_reload(struct ir3_instruction *reload) +{ + unsigned orig_components = reload->srcs[2]->uim_val; + + assert(reload->deps_count == 0); + + if (orig_components <= 4) { + if (reload->dsts[0]->flags & IR3_REG_ARRAY) { + reload->dsts[0]->wrmask = MASK(orig_components); + reload->dsts[0]->num = reload->dsts[0]->array.base; + reload->dsts[0]->flags &= ~IR3_REG_ARRAY; + } + return; + } + + for (unsigned comp = 0; comp < orig_components; comp += 4) { + unsigned components = MIN2(orig_components - comp, 4); + struct ir3_instruction *clone = ir3_instr_clone(reload); + ir3_instr_move_before(clone, reload); + + clone->dsts[0]->wrmask = MASK(components); + if (clone->dsts[0]->flags & IR3_REG_ARRAY) { + clone->dsts[0]->num = clone->dsts[0]->array.base + comp; + clone->dsts[0]->flags &= ~IR3_REG_ARRAY; + } + + clone->srcs[2]->uim_val = components; + clone->srcs[1]->uim_val += + comp * ((reload->dsts[0]->flags & IR3_REG_HALF) ? 2 : 4); + } + + list_delinit(&reload->node); +} + +static void +add_spill_reload_deps(struct ir3_block *block) +{ + struct ir3_instruction *last_spill = NULL; + + foreach_instr (instr, &block->instr_list) { + if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) && + last_spill) { + ir3_instr_add_dep(instr, last_spill); + } + + if (instr->opc == OPC_SPILL_MACRO) + last_spill = instr; + } + + + last_spill = NULL; + + foreach_instr_rev (instr, &block->instr_list) { + if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) && + last_spill) { + ir3_instr_add_dep(last_spill, instr); + } + + if (instr->opc == OPC_SPILL_MACRO) + last_spill = instr; + } +} + +bool +ir3_lower_spill(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + foreach_instr_safe (instr, &block->instr_list) { + if (instr->opc == OPC_SPILL_MACRO) + split_spill(instr); + else if (instr->opc == OPC_RELOAD_MACRO) + split_reload(instr); + } + + add_spill_reload_deps(block); + + foreach_instr (instr, &block->instr_list) { + if (instr->opc == OPC_SPILL_MACRO) + instr->opc = OPC_STP; + else if (instr->opc == OPC_RELOAD_MACRO) + instr->opc = OPC_LDP; + } + } + + return true; +} diff --git a/src/freedreno/ir3/ir3_merge_regs.c b/src/freedreno/ir3/ir3_merge_regs.c index bb88dbe8fc7..674bc648e03 100644 --- a/src/freedreno/ir3/ir3_merge_regs.c +++ b/src/freedreno/ir3/ir3_merge_regs.c @@ -198,6 +198,7 @@ get_merge_set(struct ir3_register *def) struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set); set->preferred_reg = ~0; set->interval_start = ~0; + set->spill_slot = ~0; set->size = reg_size(def); set->alignment = (def->flags & IR3_REG_HALF) ? 
1 : 2; set->regs_count = 1; @@ -339,6 +340,19 @@ try_merge_defs(struct ir3_liveness *live, struct ir3_register *a, merge_merge_sets(a_set, b_set, b_set_offset); } +void +ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset) +{ + struct ir3_merge_set *a_set = get_merge_set(a); + struct ir3_merge_set *b_set = get_merge_set(b); + + if (a_set == b_set) + return; + + int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset; + merge_merge_sets(a_set, b_set, b_set_offset); +} + static void coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi) { @@ -462,7 +476,7 @@ ir3_create_parallel_copies(struct ir3 *ir) } static void -index_merge_sets(struct ir3 *ir) +index_merge_sets(struct ir3_liveness *live, struct ir3 *ir) { unsigned offset = 0; foreach_block (block, &ir->block_list) { @@ -489,6 +503,8 @@ index_merge_sets(struct ir3 *ir) } } } + + live->interval_offset = offset; } #define RESET "\x1b[0m" @@ -559,7 +575,7 @@ ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir) } } - index_merge_sets(ir); + index_merge_sets(live, ir); if (ir3_shader_debug & IR3_DBG_RAMSGS) dump_merge_sets(ir); diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 9c320e0ed90..6870769f74d 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -1990,6 +1990,152 @@ calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure) return (target - 1) * 2 * 4; } +static void +add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg, + bool merged_regs) +{ + unsigned size = reg_size(reg); + if (reg->flags & IR3_REG_HALF) + pressure->half += size; + if (!(reg->flags & IR3_REG_HALF) || merged_regs) + pressure->full += size; +} + +static void +dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) +{ +} + +static void +dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) +{ +} + +static void +dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent, + struct ir3_reg_interval *child) +{ +} + +/* Calculate the minimum possible limit on register pressure so that spilling + * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall. + */ + +static void +calc_min_limit_pressure(struct ir3_shader_variant *v, + struct ir3_liveness *live, + struct ir3_pressure *limit) +{ + struct ir3_block *start = ir3_start_block(v->ir); + struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx); + struct ir3_reg_interval *intervals = + rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count); + + ctx->interval_add = dummy_interval_add; + ctx->interval_delete = dummy_interval_delete; + ctx->interval_readd = dummy_interval_readd; + + limit->full = limit->half = 0; + + struct ir3_pressure cur_pressure = {0}; + foreach_instr (input, &start->instr_list) { + if (input->opc != OPC_META_INPUT && + input->opc != OPC_META_TEX_PREFETCH) + break; + + add_pressure(&cur_pressure, input->dsts[0], v->mergedregs); + } + + limit->full = MAX2(limit->full, cur_pressure.full); + limit->half = MAX2(limit->half, cur_pressure.half); + + foreach_instr (input, &start->instr_list) { + if (input->opc != OPC_META_INPUT && + input->opc != OPC_META_TEX_PREFETCH) + break; + + /* pre-colored inputs may have holes, which increases the pressure. 
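An input
+       * precolored to, say, r2.x (a made-up example) makes
+       * ra_reg_get_physreg() return 16, so the MAX2() below pushes the limit
+       * to at least 16 + reg_size(dst), even if r0 and r1 are never used.
+       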
*/ + struct ir3_register *dst = input->dsts[0]; + if (dst->num != INVALID_REG) { + unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst); + if (dst->flags & IR3_REG_HALF) + limit->half = MAX2(limit->half, physreg); + if (!(dst->flags & IR3_REG_HALF) || v->mergedregs) + limit->full = MAX2(limit->full, physreg); + } + } + + foreach_block (block, &v->ir->block_list) { + rb_tree_init(&ctx->intervals); + + unsigned name; + BITSET_FOREACH_SET (name, live->live_in[block->index], + live->definitions_count) { + struct ir3_register *reg = live->definitions[name]; + ir3_reg_interval_init(&intervals[reg->name], reg); + ir3_reg_interval_insert(ctx, &intervals[reg->name]); + } + + foreach_instr (instr, &block->instr_list) { + ra_foreach_dst (dst, instr) { + ir3_reg_interval_init(&intervals[dst->name], dst); + } + /* phis and parallel copies can be deleted via spilling */ + + if (instr->opc == OPC_META_PHI) { + ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]); + continue; + } + + if (instr->opc == OPC_META_PARALLEL_COPY) + continue; + + cur_pressure = (struct ir3_pressure) {0}; + + ra_foreach_dst (dst, instr) { + if (dst->tied && !(dst->tied->flags & IR3_REG_KILL)) + add_pressure(&cur_pressure, dst, v->mergedregs); + } + + ra_foreach_src_rev (src, instr) { + /* We currently don't support spilling the parent of a source when + * making space for sources, so we have to keep track of the + * intervals and figure out the root of the tree to figure out how + * much space we need. + * + * TODO: We should probably support this in the spiller. + */ + struct ir3_reg_interval *interval = &intervals[src->def->name]; + while (interval->parent) + interval = interval->parent; + add_pressure(&cur_pressure, interval->reg, v->mergedregs); + + if (src->flags & IR3_REG_FIRST_KILL) + ir3_reg_interval_remove(ctx, &intervals[src->def->name]); + } + + limit->full = MAX2(limit->full, cur_pressure.full); + limit->half = MAX2(limit->half, cur_pressure.half); + + cur_pressure = (struct ir3_pressure) {0}; + + ra_foreach_dst (dst, instr) { + ir3_reg_interval_init(&intervals[dst->name], dst); + ir3_reg_interval_insert(ctx, &intervals[dst->name]); + add_pressure(&cur_pressure, dst, v->mergedregs); + } + + limit->full = MAX2(limit->full, cur_pressure.full); + limit->half = MAX2(limit->half, cur_pressure.half); + } + } + + /* Account for the base register, which needs to be available everywhere. */ + limit->full += 2; + + ralloc_free(ctx); +} + int ir3_ra(struct ir3_shader_variant *v) { @@ -2010,15 +2156,35 @@ ir3_ra(struct ir3_shader_variant *v) d("\thalf: %u", max_pressure.half); d("\tshared: %u", max_pressure.shared); - if (v->mergedregs) { - max_pressure.full += max_pressure.half; - max_pressure.half = 0; + /* TODO: calculate half/full limit correctly for CS with barrier */ + struct ir3_pressure limit_pressure; + limit_pressure.full = RA_FULL_SIZE; + limit_pressure.half = RA_HALF_SIZE; + limit_pressure.shared = RA_SHARED_SIZE; + + /* If requested, lower the limit so that spilling happens more often. 
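This is the
+    * IR3_DBG_SPILLALL path from above. A usage sketch (IR3_SHADER_DEBUG is the
+    * existing ir3 debug environment variable; the deqp invocation is only an
+    * example):
+    *
+    *    IR3_SHADER_DEBUG=spillall ./deqp-vk --deqp-case=dEQP-VK.ssbo.*
+    *
+    * This drops the limit to the minimum computed by calc_min_limit_pressure(),
+    * so nearly every live value is spilled and reloaded.
+    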
*/ + if (ir3_shader_debug & IR3_DBG_SPILLALL) + calc_min_limit_pressure(v, live, &limit_pressure); + + if (max_pressure.shared > limit_pressure.shared) { + /* TODO shared reg -> normal reg spilling */ + d("shared max pressure exceeded!"); + return 1; } - if (max_pressure.full > RA_FULL_SIZE || max_pressure.half > RA_HALF_SIZE || - max_pressure.shared > RA_SHARED_SIZE) { - d("max pressure exceeded!"); - return 1; + bool spilled = false; + if (max_pressure.full > limit_pressure.full || + max_pressure.half > limit_pressure.half) { + if (!v->shader->compiler->has_pvtmem) { + d("max pressure exceeded!"); + return 1; + } + d("max pressure exceeded, spilling!"); + IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure); + ir3_calc_pressure(v, live, &max_pressure); + assert(max_pressure.full <= limit_pressure.full && + max_pressure.half <= limit_pressure.half); + spilled = true; } struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx); @@ -2054,19 +2220,20 @@ ir3_ra(struct ir3_shader_variant *v) for (unsigned i = 0; i < instr->dsts_count; i++) { instr->dsts[i]->flags &= ~IR3_REG_SSA; - /* Parallel copies of array registers copy the whole register, - * and we need some way to let the parallel copy code know - * that this was an array whose size is determined by - * reg->size. So keep the array flag on those. + /* Parallel copies of array registers copy the whole register, and + * we need some way to let the parallel copy code know that this was + * an array whose size is determined by reg->size. So keep the array + * flag on those. spill/reload also need to work on the entire + * array. */ - if (!is_meta(instr)) + if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO) instr->dsts[i]->flags &= ~IR3_REG_ARRAY; } for (unsigned i = 0; i < instr->srcs_count; i++) { instr->srcs[i]->flags &= ~IR3_REG_SSA; - if (!is_meta(instr)) + if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO) instr->srcs[i]->flags &= ~IR3_REG_ARRAY; } } @@ -2074,6 +2241,10 @@ ir3_ra(struct ir3_shader_variant *v) ir3_debug_print(v->ir, "AFTER: register allocation"); + if (spilled) { + IR3_PASS(v->ir, ir3_lower_spill); + } + ir3_lower_copies(v); ir3_debug_print(v->ir, "AFTER: ir3_lower_copies"); diff --git a/src/freedreno/ir3/ir3_ra.h b/src/freedreno/ir3/ir3_ra.h index fcef6a908e1..4a7c9d7752a 100644 --- a/src/freedreno/ir3/ir3_ra.h +++ b/src/freedreno/ir3/ir3_ra.h @@ -137,6 +137,7 @@ ra_reg_is_dst(const struct ir3_register *reg) struct ir3_liveness { unsigned block_count; + unsigned interval_offset; DECLARE_ARRAY(struct ir3_register *, definitions); DECLARE_ARRAY(BITSET_WORD *, live_out); DECLARE_ARRAY(BITSET_WORD *, live_in); @@ -151,6 +152,9 @@ void ir3_create_parallel_copies(struct ir3 *ir); void ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir); +void ir3_force_merge(struct ir3_register *a, struct ir3_register *b, + int b_offset); + struct ir3_pressure { unsigned full, half, shared; }; @@ -158,6 +162,12 @@ struct ir3_pressure { void ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live, struct ir3_pressure *max_pressure); +bool ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v, + struct ir3_liveness **live, + const struct ir3_pressure *limit_pressure); + +bool ir3_lower_spill(struct ir3 *ir); + void ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size, unsigned half_size, unsigned block_count); diff --git a/src/freedreno/ir3/ir3_spill.c b/src/freedreno/ir3/ir3_spill.c index 54ae56b1beb..0b9c56a7680 100644 --- a/src/freedreno/ir3/ir3_spill.c +++ b/src/freedreno/ir3/ir3_spill.c @@ -26,62 
+26,318 @@
 #include "ir3_shader.h"
 
 /*
- * This pass does one thing so far:
+ * This pass does two things:
  *
  * 1. Calculates the maximum register pressure. To do this, we need to use the
- * exact same technique that RA uses for combining meta_split instructions
- * with their sources, so that our calculation agrees with RA.
- *
- * It will also optionally spill registers once that's implemented.
+ *    exact same technique that RA uses for combining meta_split instructions
+ *    with their sources, so that our calculation agrees with RA.
+ * 2. Spills when the register pressure exceeds a limit calculated by RA.
+ *    The implementation is based on "Register Spilling and Live-Range Splitting
+ *    for SSA-Form Programs" by Braun and Hack, although again care has to be
+ *    taken to handle combining split/collect instructions.
  */
 
+struct reg_or_immed {
+   unsigned flags;
+   union {
+      struct ir3_register *def;
+      uint32_t uimm;
+      unsigned const_num;
+   };
+};
+
 struct ra_spill_interval {
    struct ir3_reg_interval interval;
+
+   struct rb_node node;
+   struct rb_node half_node;
+
+   /* The current SSA value/const/immed this source is mapped to. */
+   struct reg_or_immed dst;
+
+   /* When computing use distances we use the distance relative to the start
+    * of the block. So, for example, a value that's defined in cycle 5 of the
+    * block and used 6 cycles later will always have a next_use_distance of 11
+    * until we reach that use.
+    */
+   unsigned next_use_distance;
+
+   /* Whether this value was reloaded and therefore doesn't need to be
+    * spilled again. Corresponds to the S set in the paper.
+    */
+   bool already_spilled;
+
+   /* We need to add sources early for accounting purposes, but we have to
+    * insert the reload code for them last. Keep track of whether this interval
+    * needs to be reloaded later.
+    */
+   bool needs_reload;
+
+   /* Keep track of whether this interval currently can't be spilled because:
+    * - It or one of its children is a source and we're making space for
+    *   sources.
+    * - It is a destination and we're making space for destinations.
+    */
+   bool cant_spill;
+};
+
+struct ra_spill_block_state {
+   unsigned *next_use_end;
+   unsigned *next_use_start;
+
+   unsigned cycles;
+
+   /* Map from SSA def to the reg_or_immed it is mapped to at the end of the
+    * block. This map only contains values which we didn't spill, so it also
+    * serves as a record of the new live-out set for this block.
+    */
+   struct hash_table *remap;
+
+   /* For blocks whose successors are visited first (i.e. loop backedges), which
+    * values should be live at the end.
+    */
+   BITSET_WORD *live_out;
+
+   bool visited;
 };
 
 struct ra_spill_ctx {
    struct ir3_reg_ctx reg_ctx;
 
-   struct ra_spill_interval *intervals;
+   struct ra_spill_interval **intervals;
+   unsigned intervals_count;
+
+   /* rb tree of live intervals that we can spill, ordered by next-use distance.
+    * full_live_intervals also contains the half intervals in the merged_regs
+    * case. We use this list to determine what to spill.
+    */
+   struct rb_tree full_live_intervals;
+   struct rb_tree half_live_intervals;
 
    struct ir3_pressure cur_pressure, max_pressure;
+   struct ir3_pressure limit_pressure;
+
+   /* When spilling, we need to reserve a register to serve as the zero'd
+    * "base". For simplicity we reserve a register at the beginning so that it's
+    * always available.
+    */
+   struct ir3_register *base_reg;
+
+   /* Current pvtmem offset in bytes. 
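Each spilled value is assigned a
+    * slot here by get_spill_slot(). A sketch of the arithmetic for a value
+    * without a merge set (reg_size() counts 16-bit half-reg units, hence the
+    * "* 2" to convert to bytes):
+    *
+    *    reg->spill_slot = ALIGN_POT(ctx->spill_slot, reg_elem_size(reg));
+    *    ctx->spill_slot = reg->spill_slot + reg_size(reg) * 2;
+    *
+    * e.g. a full-precision vec4 (reg_size() == 8) takes 16 bytes of pvtmem,
+    * and a half vec4 takes 8.
+    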
*/ + unsigned spill_slot; + struct ir3_liveness *live; const struct ir3_compiler *compiler; + + struct ra_spill_block_state *blocks; + + bool spilling; + + bool merged_regs; }; +static void +add_base_reg(struct ra_spill_ctx *ctx, struct ir3 *ir) +{ + struct ir3_block *start = ir3_start_block(ir); + + /* We need to stick it after any meta instructions which need to be first. */ + struct ir3_instruction *after = NULL; + foreach_instr (instr, &start->instr_list) { + if (instr->opc != OPC_META_INPUT && + instr->opc != OPC_META_TEX_PREFETCH) { + after = instr; + break; + } + } + + struct ir3_instruction *mov = create_immed(start, 0); + + if (after) + ir3_instr_move_before(mov, after); + + ctx->base_reg = mov->dsts[0]; + + /* We don't create an interval, etc. for the base reg, so just lower the + * register pressure limit to account for it. We assume it's always + * available for simplicity. + */ + ctx->limit_pressure.full -= reg_size(ctx->base_reg); +} + + +/* Compute the number of cycles per instruction used for next-use-distance + * analysis. This is just approximate, obviously. + */ +static unsigned +instr_cycles(struct ir3_instruction *instr) +{ + if (instr->opc == OPC_META_PARALLEL_COPY) { + unsigned cycles = 0; + for (unsigned i = 0; i < instr->dsts_count; i++) { + if (!instr->srcs[i]->def || + instr->srcs[i]->def->merge_set != instr->dsts[i]->merge_set) { + cycles += reg_elems(instr->srcs[i]); + } + } + + return cycles; + } + + if (instr->opc == OPC_META_COLLECT) { + unsigned cycles = 0; + for (unsigned i = 0; i < instr->srcs_count; i++) { + if (!instr->srcs[i]->def || + instr->srcs[i]->def->merge_set != instr->dsts[0]->merge_set) { + cycles++; + } + } + + return cycles; + } + + if (is_meta(instr)) + return 0; + + return 1 + instr->repeat; +} + +static bool +compute_block_next_distance(struct ra_spill_ctx *ctx, struct ir3_block *block, + unsigned *tmp_next_use) +{ + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + memcpy(tmp_next_use, state->next_use_end, + ctx->live->definitions_count * sizeof(*tmp_next_use)); + + unsigned cycle = state->cycles; + foreach_instr_rev (instr, &block->instr_list) { + ra_foreach_dst (dst, instr) { + dst->next_use = tmp_next_use[dst->name]; + } + + ra_foreach_src (src, instr) { + src->next_use = tmp_next_use[src->def->name]; + } + + cycle -= instr_cycles(instr); + + if (instr->opc == OPC_META_PARALLEL_COPY) { + ra_foreach_src_n (src, i, instr) { + if (src->def->merge_set == instr->dsts[i]->merge_set && + src->def->merge_set_offset == instr->dsts[i]->merge_set_offset) { + tmp_next_use[src->def->name] = + tmp_next_use[instr->dsts[i]->name]; + } else { + tmp_next_use[src->def->name] = cycle; + } + } + } else if (instr->opc != OPC_META_PHI) { + ra_foreach_src (src, instr) { + tmp_next_use[src->def->name] = cycle; + } + } + + ra_foreach_dst (dst, instr) { + tmp_next_use[dst->name] = UINT_MAX; + } + } + + memcpy(state->next_use_start, tmp_next_use, + ctx->live->definitions_count * sizeof(*tmp_next_use)); + + bool progress = false; + for (unsigned i = 0; i < block->predecessors_count; i++) { + const struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *pred_state = &ctx->blocks[pred->index]; + + /* Add a large-enough distance in front of edges exiting the loop so that + * variables that are live-through the loop but not used inside it are + * prioritized for spilling, as per the paper. This just needs to be + * larger than the longest path through the loop. 
+ */ + bool loop_exit = pred->loop_depth < block->loop_depth; + unsigned block_distance = pred_state->cycles + (loop_exit ? 100000 : 0); + + for (unsigned j = 0; j < ctx->live->definitions_count; j++) { + if (state->next_use_start[j] < UINT_MAX && + state->next_use_start[j] + block_distance < + pred_state->next_use_end[j]) { + pred_state->next_use_end[j] = state->next_use_start[j] + + block_distance; + progress = true; + } + } + + foreach_instr (phi, &block->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + if (!phi->srcs[i]->def) + continue; + unsigned src = phi->srcs[i]->def->name; + if (phi->dsts[0]->next_use < UINT_MAX && + phi->dsts[0]->next_use + block_distance < + pred_state->next_use_end[src]) { + pred_state->next_use_end[src] = phi->dsts[0]->next_use + + block_distance; + progress = true; + } + } + } + + return progress; +} + +static void +compute_next_distance(struct ra_spill_ctx *ctx, struct ir3 *ir) +{ + for (unsigned i = 0; i < ctx->live->block_count; i++) { + ctx->blocks[i].next_use_start = + ralloc_array(ctx, unsigned, ctx->live->definitions_count); + ctx->blocks[i].next_use_end = + ralloc_array(ctx, unsigned, ctx->live->definitions_count); + + for (unsigned j = 0; j < ctx->live->definitions_count; j++) { + ctx->blocks[i].next_use_start[j] = UINT_MAX; + ctx->blocks[i].next_use_end[j] = UINT_MAX; + } + } + + foreach_block (block, &ir->block_list) { + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + state->cycles = 0; + foreach_instr (instr, &block->instr_list) { + state->cycles += instr_cycles(instr); + foreach_dst (dst, instr) { + dst->spill_slot = ~0; + } + } + } + + unsigned *tmp_next_use = + ralloc_array(ctx, unsigned, ctx->live->definitions_count); + + bool progress = true; + while (progress) { + progress = false; + foreach_block_rev (block, &ir->block_list) { + progress |= compute_block_next_distance(ctx, block, tmp_next_use); + } + } +} + static void ra_spill_interval_init(struct ra_spill_interval *interval, struct ir3_register *reg) { ir3_reg_interval_init(&interval->interval, reg); -} - -static void -ra_pressure_add(struct ir3_pressure *pressure, - struct ra_spill_interval *interval) -{ - unsigned size = reg_size(interval->interval.reg); - if (interval->interval.reg->flags & IR3_REG_SHARED) - pressure->shared += size; - else if (interval->interval.reg->flags & IR3_REG_HALF) - pressure->half += size; - else - pressure->full += size; -} - -static void -ra_pressure_sub(struct ir3_pressure *pressure, - struct ra_spill_interval *interval) -{ - unsigned size = reg_size(interval->interval.reg); - if (interval->interval.reg->flags & IR3_REG_SHARED) - pressure->shared -= size; - else if (interval->interval.reg->flags & IR3_REG_HALF) - pressure->half -= size; - else - pressure->full -= size; + interval->dst.flags = reg->flags; + interval->dst.def = reg; + interval->already_spilled = false; + interval->needs_reload = false; + interval->cant_spill = false; } static struct ra_spill_interval * @@ -90,19 +346,66 @@ ir3_reg_interval_to_interval(struct ir3_reg_interval *interval) return rb_node_data(struct ra_spill_interval, interval, interval); } +static struct ra_spill_interval * +ra_spill_interval_root(struct ra_spill_interval *interval) +{ + struct ir3_reg_interval *ir3_interval = &interval->interval; + while (ir3_interval->parent) + ir3_interval = ir3_interval->parent; + return ir3_reg_interval_to_interval(ir3_interval); +} + static struct ra_spill_ctx * ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx) { return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx); 
} +static int +ra_spill_interval_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ra_spill_interval *a = + rb_node_data(const struct ra_spill_interval, _a, node); + const struct ra_spill_interval *b = + rb_node_data(const struct ra_spill_interval, _b, node); + return a->next_use_distance - b->next_use_distance; +} + +static int +ra_spill_interval_half_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ra_spill_interval *a = + rb_node_data(const struct ra_spill_interval, _a, half_node); + const struct ra_spill_interval *b = + rb_node_data(const struct ra_spill_interval, _b, half_node); + return a->next_use_distance - b->next_use_distance; +} + static void interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval) { struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval); struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx); - ra_pressure_add(&ctx->cur_pressure, interval); + unsigned size = reg_size(interval->interval.reg); + if (interval->interval.reg->flags & IR3_REG_SHARED) { + ctx->cur_pressure.shared += size; + } else { + if (interval->interval.reg->flags & IR3_REG_HALF) { + ctx->cur_pressure.half += size; + if (ctx->spilling) { + rb_tree_insert(&ctx->half_live_intervals, &interval->half_node, + ra_spill_interval_half_cmp); + } + } + if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) { + ctx->cur_pressure.full += size; + if (ctx->spilling) { + rb_tree_insert(&ctx->full_live_intervals, &interval->node, + ra_spill_interval_cmp); + } + } + } } static void @@ -111,7 +414,23 @@ interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval) struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval); struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx); - ra_pressure_sub(&ctx->cur_pressure, interval); + unsigned size = reg_size(interval->interval.reg); + if (interval->interval.reg->flags & IR3_REG_SHARED) { + ctx->cur_pressure.shared -= size; + } else { + if (interval->interval.reg->flags & IR3_REG_HALF) { + ctx->cur_pressure.half -= size; + if (ctx->spilling) { + rb_tree_remove(&ctx->half_live_intervals, &interval->half_node); + } + } + if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) { + ctx->cur_pressure.full -= size; + if (ctx->spilling) { + rb_tree_remove(&ctx->full_live_intervals, &interval->node); + } + } + } } static void @@ -122,8 +441,22 @@ interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent, } static void -spill_ctx_init(struct ra_spill_ctx *ctx) +spill_ctx_init(struct ra_spill_ctx *ctx, struct ir3_shader_variant *v, + struct ir3_liveness *live) { + ctx->live = live; + ctx->intervals = ralloc_array(ctx, struct ra_spill_interval *, + ctx->live->definitions_count); + struct ra_spill_interval *intervals = + rzalloc_array(ctx, struct ra_spill_interval, + ctx->live->definitions_count); + for (unsigned i = 0; i < ctx->live->definitions_count; i++) + ctx->intervals[i] = &intervals[i]; + + ctx->intervals_count = ctx->live->definitions_count; + ctx->compiler = v->shader->compiler; + ctx->merged_regs = v->mergedregs; + rb_tree_init(&ctx->reg_ctx.intervals); ctx->reg_ctx.interval_add = interval_add; ctx->reg_ctx.interval_delete = interval_delete; @@ -147,18 +480,21 @@ ra_spill_ctx_remove(struct ra_spill_ctx *ctx, static void init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) { - struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + struct ra_spill_interval *interval = 
ctx->intervals[dst->name]; ra_spill_interval_init(interval, dst); + if (ctx->spilling) + interval->next_use_distance = dst->next_use; } static void insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) { - struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + struct ra_spill_interval *interval = ctx->intervals[dst->name]; if (interval->interval.inserted) return; ra_spill_ctx_insert(ctx, interval); + interval->cant_spill = true; /* For precolored inputs, make sure we leave enough registers to allow for * holes in the inputs. It can happen that the binning shader has a lower @@ -179,14 +515,26 @@ insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) } } +static void +insert_src(struct ra_spill_ctx *ctx, struct ir3_register *src) +{ + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; + + ra_spill_interval_root(interval)->cant_spill = true; + + if (interval->interval.inserted) + return; + + ra_spill_ctx_insert(ctx, interval); + interval->needs_reload = true; + interval->already_spilled = true; +} + static void remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { - if (!(src->flags & IR3_REG_FIRST_KILL)) - return; - - struct ra_spill_interval *interval = &ctx->intervals[src->def->name]; + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; if (!interval->interval.inserted || interval->interval.parent || !rb_tree_is_empty(&interval->interval.children)) @@ -199,10 +547,7 @@ static void remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { - if (!(src->flags & IR3_REG_FIRST_KILL)) - return; - - struct ra_spill_interval *interval = &ctx->intervals[src->def->name]; + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; if (!interval->interval.inserted) return; @@ -210,10 +555,17 @@ remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, ra_spill_ctx_remove(ctx, interval); } +static void +finish_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) +{ + struct ra_spill_interval *interval = ctx->intervals[dst->name]; + interval->cant_spill = false; +} + static void remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) { - struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + struct ra_spill_interval *interval = ctx->intervals[dst->name]; if (!interval->interval.inserted) return; @@ -221,6 +573,361 @@ remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) ra_spill_ctx_remove(ctx, interval); } +static void +update_src_next_use(struct ra_spill_ctx *ctx, struct ir3_register *src) +{ + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; + + assert(interval->interval.inserted); + + interval->next_use_distance = src->next_use; + + /* If this node is inserted in one of the trees, then it needs to be resorted + * as its key has changed. 
+ */ + if (!interval->interval.parent && !(src->flags & IR3_REG_SHARED)) { + if (src->flags & IR3_REG_HALF) { + rb_tree_remove(&ctx->half_live_intervals, &interval->half_node); + rb_tree_insert(&ctx->half_live_intervals, &interval->half_node, + ra_spill_interval_half_cmp); + } + if (ctx->merged_regs || !(src->flags & IR3_REG_HALF)) { + rb_tree_remove(&ctx->full_live_intervals, &interval->node); + rb_tree_insert(&ctx->full_live_intervals, &interval->node, + ra_spill_interval_cmp); + } + } +} + +static unsigned +get_spill_slot(struct ra_spill_ctx *ctx, struct ir3_register *reg) +{ + if (reg->merge_set) { + if (reg->merge_set->spill_slot == ~0) { + reg->merge_set->spill_slot = ALIGN_POT(ctx->spill_slot, + reg->merge_set->alignment); + ctx->spill_slot = reg->merge_set->spill_slot + reg->merge_set->size * 2; + } + return reg->merge_set->spill_slot + reg->merge_set_offset * 2; + } else { + if (reg->spill_slot == ~0) { + reg->spill_slot = ALIGN_POT(ctx->spill_slot, reg_elem_size(reg)); + ctx->spill_slot = reg->spill_slot + reg_size(reg) * 2; + } + return reg->spill_slot; + } +} + +static void +set_src_val(struct ir3_register *src, const struct reg_or_immed *val) +{ + if (val->flags & IR3_REG_IMMED) { + src->flags = IR3_REG_IMMED | (val->flags & IR3_REG_HALF); + src->uim_val = val->uimm; + src->def = NULL; + } else if (val->flags & IR3_REG_CONST) { + src->flags = IR3_REG_CONST | (val->flags & IR3_REG_HALF); + src->num = val->const_num; + src->def = NULL; + } else { + src->def = val->def; + } +} + +static struct ir3_register * +materialize_pcopy_src(const struct reg_or_immed *src, + struct ir3_instruction *instr, + struct ir3_block *block) +{ + struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1); + struct ir3_register *dst = __ssa_dst(mov); + dst->flags |= src->flags & IR3_REG_HALF; + struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, src->flags); + set_src_val(mov_src, src); + mov->cat1.src_type = mov->cat1.dst_type = + (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + + if (instr) + ir3_instr_move_before(mov, instr); + return dst; +} + +static void +spill(struct ra_spill_ctx *ctx, const struct reg_or_immed *val, + unsigned spill_slot, struct ir3_instruction *instr, struct ir3_block *block) +{ + struct ir3_register *reg; + + /* If spilling an immed/const pcopy src, we need to actually materialize it + * first with a mov. + */ + if (val->flags & (IR3_REG_CONST | IR3_REG_IMMED)) { + reg = materialize_pcopy_src(val, instr, block); + } else { + reg = val->def; + } + + d("spilling ssa_%u:%u to %u", reg->instr->serialno, reg->name, + spill_slot); + + unsigned elems = reg_elems(reg); + struct ir3_instruction *spill = + ir3_instr_create(block, OPC_SPILL_MACRO, 0, 3); + ir3_src_create(spill, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg; + unsigned src_flags = reg->flags & (IR3_REG_HALF | IR3_REG_IMMED | + IR3_REG_CONST | IR3_REG_SSA | + IR3_REG_ARRAY); + struct ir3_register *src = ir3_src_create(spill, INVALID_REG, src_flags); + ir3_src_create(spill, INVALID_REG, IR3_REG_IMMED)->uim_val = elems; + spill->cat6.dst_offset = spill_slot; + spill->cat6.type = (reg->flags & IR3_REG_HALF) ? 
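/* 16-bit stores for half regs */ 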
+static void
+set_src_val(struct ir3_register *src, const struct reg_or_immed *val)
+{
+   if (val->flags & IR3_REG_IMMED) {
+      src->flags = IR3_REG_IMMED | (val->flags & IR3_REG_HALF);
+      src->uim_val = val->uimm;
+      src->def = NULL;
+   } else if (val->flags & IR3_REG_CONST) {
+      src->flags = IR3_REG_CONST | (val->flags & IR3_REG_HALF);
+      src->num = val->const_num;
+      src->def = NULL;
+   } else {
+      src->def = val->def;
+   }
+}
+
+static struct ir3_register *
+materialize_pcopy_src(const struct reg_or_immed *src,
+                      struct ir3_instruction *instr,
+                      struct ir3_block *block)
+{
+   struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+   struct ir3_register *dst = __ssa_dst(mov);
+   dst->flags |= src->flags & IR3_REG_HALF;
+   struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, src->flags);
+   set_src_val(mov_src, src);
+   mov->cat1.src_type = mov->cat1.dst_type =
+      (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+
+   if (instr)
+      ir3_instr_move_before(mov, instr);
+   return dst;
+}
+
+static void
+spill(struct ra_spill_ctx *ctx, const struct reg_or_immed *val,
+      unsigned spill_slot, struct ir3_instruction *instr,
+      struct ir3_block *block)
+{
+   struct ir3_register *reg;
+
+   /* If spilling an immed/const pcopy src, we need to actually materialize it
+    * first with a mov.
+    */
+   if (val->flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
+      reg = materialize_pcopy_src(val, instr, block);
+   } else {
+      reg = val->def;
+   }
+
+   d("spilling ssa_%u:%u to %u", reg->instr->serialno, reg->name,
+     spill_slot);
+
+   unsigned elems = reg_elems(reg);
+   struct ir3_instruction *spill =
+      ir3_instr_create(block, OPC_SPILL_MACRO, 0, 3);
+   ir3_src_create(spill, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
+   unsigned src_flags = reg->flags & (IR3_REG_HALF | IR3_REG_IMMED |
+                                      IR3_REG_CONST | IR3_REG_SSA |
+                                      IR3_REG_ARRAY);
+   struct ir3_register *src = ir3_src_create(spill, INVALID_REG, src_flags);
+   ir3_src_create(spill, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
+   spill->cat6.dst_offset = spill_slot;
+   spill->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+
+   src->def = reg;
+   if (reg->flags & IR3_REG_ARRAY) {
+      src->size = reg->size;
+      src->array.id = reg->array.id;
+      src->array.offset = 0;
+   } else {
+      src->wrmask = reg->wrmask;
+   }
+
+   if (instr)
+      ir3_instr_move_before(spill, instr);
+}
+
+static void
+spill_interval(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
+               struct ir3_instruction *instr, struct ir3_block *block)
+{
+   spill(ctx, &interval->dst, get_spill_slot(ctx, interval->interval.reg),
+         instr, block);
+}
+
+/* This is similar to "limit" in the paper. */
+static void
+limit(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
+{
+   if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
+      d("cur half pressure %u exceeds %u", ctx->cur_pressure.half,
+        ctx->limit_pressure.half);
+      rb_tree_foreach_safe (struct ra_spill_interval, interval,
+                            &ctx->half_live_intervals, half_node) {
+         d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
+           interval->interval.reg->name);
+         if (!interval->cant_spill) {
+            if (!interval->already_spilled)
+               spill_interval(ctx, interval, instr, instr->block);
+            ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
+            if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
+               break;
+         }
+      }
+
+      assert(ctx->cur_pressure.half <= ctx->limit_pressure.half);
+   }
+
+   if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
+      d("cur full pressure %u exceeds %u", ctx->cur_pressure.full,
+        ctx->limit_pressure.full);
+      rb_tree_foreach_safe (struct ra_spill_interval, interval,
+                            &ctx->full_live_intervals, node) {
+         d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
+           interval->interval.reg->name);
+         if (!interval->cant_spill) {
+            if (!interval->already_spilled)
+               spill_interval(ctx, interval, instr, instr->block);
+            ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
+            if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
+               break;
+         } else {
+            d("can't spill");
+         }
+      }
+
+      assert(ctx->cur_pressure.full <= ctx->limit_pressure.full);
+   }
+}
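limit() is the eviction half of the Belady-style heuristic from the paper
(Braun/Hack): when pressure exceeds the limit, evict the candidate whose next
use is furthest away, and skip the actual store for values already spilled
(already_spilled). The walk takes the trees' entries in key order, so for the
loops above to evict furthest-first, the comparator (defined earlier in the
file, outside this hunk) has to sort larger next_use_distance first -- a
plausible shape, illustrative only:

   /* Hypothetical sketch of the ordering the live-interval trees need:
    * decreasing next-use distance, so the head of the tree is the cheapest
    * value to evict. Tie-break is arbitrary but must be a total order. */
   static int
   interval_cmp_sketch(const struct ra_spill_interval *a,
                       const struct ra_spill_interval *b)
   {
      if (a->next_use_distance != b->next_use_distance)
         return (a->next_use_distance > b->next_use_distance) ? -1 : 1;
      return (a < b) ? -1 : 1;
   }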
+/* There's a corner case where we reload a value which has overlapping live
+ * values already reloaded, either because it's the child of some other
+ * interval that was already reloaded or because some of its children have
+ * already been reloaded. Because RA only expects overlapping source/dest
+ * intervals for meta instructions (split/collect), and we don't want to add
+ * register pressure by creating an entirely separate value, we need to add
+ * splits and collects to deal with this case. These splits/collects also
+ * have to have correct merge-set information, so that they don't result in
+ * any actual code or register pressure in practice.
+ */
+
+static void
+add_to_merge_set(struct ir3_merge_set *set, struct ir3_register *def,
+                 unsigned offset)
+{
+   def->merge_set = set;
+   def->merge_set_offset = offset;
+   def->interval_start = set->interval_start + offset;
+   def->interval_end = set->interval_start + offset + reg_size(def);
+}
+
+static struct ir3_register *
+split(struct ir3_register *def, unsigned offset,
+      struct ir3_instruction *after, struct ir3_block *block)
+{
+   if (reg_elems(def) == 1) {
+      assert(offset == 0);
+      return def;
+   }
+
+   assert(!(def->flags & IR3_REG_ARRAY));
+   assert(def->merge_set);
+   struct ir3_instruction *split =
+      ir3_instr_create(after->block, OPC_META_SPLIT, 1, 1);
+   struct ir3_register *dst = __ssa_dst(split);
+   dst->flags |= def->flags & IR3_REG_HALF;
+   struct ir3_register *src = ir3_src_create(split, INVALID_REG, def->flags);
+   src->wrmask = def->wrmask;
+   src->def = def;
+   add_to_merge_set(def->merge_set, dst,
+                    def->merge_set_offset + offset * reg_elem_size(def));
+   if (after)
+      ir3_instr_move_before(split, after);
+   return dst;
+}
+
+static struct ir3_register *
+extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
+        struct ir3_instruction *after, struct ir3_block *block)
+{
+   if (offset == 0 && elems == reg_elems(parent_def))
+      return parent_def;
+
+   struct ir3_instruction *collect =
+      ir3_instr_create(after->block, OPC_META_COLLECT, 1, elems);
+   struct ir3_register *dst = __ssa_dst(collect);
+   dst->flags |= parent_def->flags & IR3_REG_HALF;
+   dst->wrmask = MASK(elems);
+   add_to_merge_set(parent_def->merge_set, dst, parent_def->merge_set_offset);
+
+   for (unsigned i = 0; i < elems; i++) {
+      ir3_src_create(collect, INVALID_REG, parent_def->flags)->def =
+         split(parent_def, offset + i, after, block);
+   }
+
+   if (after)
+      ir3_instr_move_before(collect, after);
+   return dst;
+}
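For instance, extracting a two-element slice starting at element 1 of an
already-live vec4 ssa_0 generates (illustrative textual IR, not real ir3
disassembly):

   ssa_1 = split ssa_0, off=1     ; element 1
   ssa_2 = split ssa_0, off=2     ; element 2
   ssa_3 = collect ssa_1, ssa_2   ; the vec2 slice

Because add_to_merge_set() places every new def at the parent's merge-set
offset, RA should coalesce all of these back onto the parent's registers, so
no moves and no extra pressure are generated in practice.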
+static struct ir3_register *
+reload(struct ra_spill_ctx *ctx, struct ir3_register *reg,
+       struct ir3_instruction *after, struct ir3_block *block)
+{
+   unsigned spill_slot = get_spill_slot(ctx, reg);
+
+   d("reloading ssa_%u:%u from %u", reg->instr->serialno, reg->name,
+     spill_slot);
+
+   unsigned elems = reg_elems(reg);
+   struct ir3_instruction *reload =
+      ir3_instr_create(block, OPC_RELOAD_MACRO, 1, 3);
+   struct ir3_register *dst = __ssa_dst(reload);
+   dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
+   ir3_src_create(reload, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
+   struct ir3_register *offset_reg =
+      ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED);
+   offset_reg->uim_val = spill_slot;
+   ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
+   reload->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+
+   if (reg->flags & IR3_REG_ARRAY) {
+      dst->array.offset = 0;
+      dst->array.id = reg->array.id;
+      dst->size = reg->size;
+   } else {
+      dst->wrmask = MASK(elems);
+   }
+
+   dst->merge_set = reg->merge_set;
+   dst->merge_set_offset = reg->merge_set_offset;
+   dst->interval_start = reg->interval_start;
+   dst->interval_end = reg->interval_end;
+
+   if (after)
+      ir3_instr_move_before(reload, after);
+
+   return dst;
+}
+
+static void
+rewrite_src_interval(struct ra_spill_ctx *ctx,
+                     struct ra_spill_interval *interval,
+                     struct ir3_register *def,
+                     struct ir3_instruction *instr,
+                     struct ir3_block *block)
+{
+   interval->dst.flags = def->flags;
+   interval->dst.def = def;
+   interval->needs_reload = false;
+
+   rb_tree_foreach (struct ra_spill_interval, child,
+                    &interval->interval.children, interval.node) {
+      struct ir3_register *child_reg = child->interval.reg;
+      struct ir3_register *child_def =
+         extract(def, (child_reg->interval_start -
+                       interval->interval.reg->interval_start) / reg_elem_size(def),
+                 reg_elems(child_reg), instr, block);
+      rewrite_src_interval(ctx, child, child_def, instr, block);
+   }
+}
+
+static void
+reload_def(struct ra_spill_ctx *ctx, struct ir3_register *def,
+           struct ir3_instruction *instr, struct ir3_block *block)
+{
+   unsigned elems = reg_elems(def);
+   struct ra_spill_interval *interval = ctx->intervals[def->name];
+
+   struct ir3_reg_interval *ir3_parent = interval->interval.parent;
+
+   if (ir3_parent) {
+      struct ra_spill_interval *parent =
+         ir3_reg_interval_to_interval(ir3_parent);
+      if (!parent->needs_reload) {
+         interval->dst.flags = def->flags;
+         interval->dst.def = extract(
+            parent->dst.def, (def->interval_start - parent->dst.def->interval_start) /
+            reg_elem_size(def), elems, instr, block);
+         return;
+      }
+   }
+
+   struct ir3_register *dst = reload(ctx, def, instr, block);
+
+   rewrite_src_interval(ctx, interval, dst, instr, block);
+}
+
+static void
+reload_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
+           struct ir3_register *src)
+{
+   struct ra_spill_interval *interval = ctx->intervals[src->def->name];
+
+   if (interval->needs_reload) {
+      reload_def(ctx, src->def, instr, instr->block);
+   }
+
+   ra_spill_interval_root(interval)->cant_spill = false;
+}
+
+static void
+rewrite_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
+            struct ir3_register *src)
+{
+   struct ra_spill_interval *interval = ctx->intervals[src->def->name];
+
+   set_src_val(src, &interval->dst);
+}
+
 static void
 update_max_pressure(struct ra_spill_ctx *ctx)
 {
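Note that spill() and reload() only emit OPC_SPILL_MACRO/OPC_RELOAD_MACRO
pseudo-instructions addressed off base_reg; presumably the new
ir3_lower_spill.c (added to meson.build at the end of this patch) rewrites
them into real private-memory accesses once sizes and offsets are final,
something on the order of:

   spill:   stp.u32  p[base + dst_offset], src, elems
   reload:  ldp.u32  dst, p[base + offset], elems

(illustrative syntax only -- the actual lowering isn't in this hunk). The
store-like shape is consistent with the ir3_validate.c change below, which
validates OPC_SPILL_MACRO alongside OPC_STL/OPC_STP/OPC_STLW.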
@@ -240,12 +947,15 @@ update_max_pressure(struct ra_spill_ctx *ctx)
 static void
 handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
 {
-   di(instr, "processing");
-
    ra_foreach_dst (dst, instr) {
       init_dst(ctx, dst);
    }
 
+   if (ctx->spilling) {
+      ra_foreach_src (src, instr)
+         insert_src(ctx, src);
+   }
+
    /* Handle tied destinations. If a destination is tied to a source and that
     * source is live-through, then we need to allocate a new register for the
    * destination which is live-through itself and cannot overlap the
@@ -258,7 +968,17 @@ handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
       insert_dst(ctx, dst);
    }
 
-   update_max_pressure(ctx);
+   if (ctx->spilling)
+      limit(ctx, instr);
+   else
+      update_max_pressure(ctx);
+
+   if (ctx->spilling) {
+      ra_foreach_src (src, instr) {
+         reload_src(ctx, instr, src);
+         update_src_next_use(ctx, src);
+      }
+   }
 
    ra_foreach_src (src, instr) {
       if (src->flags & IR3_REG_FIRST_KILL)
@@ -269,13 +989,29 @@ handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
       insert_dst(ctx, dst);
    }
 
-   update_max_pressure(ctx);
+   if (ctx->spilling)
+      limit(ctx, instr);
+   else
+      update_max_pressure(ctx);
 
-   for (unsigned i = 0; i < instr->srcs_count; i++) {
-      if (ra_reg_is_src(instr->srcs[i]) &&
-          (instr->srcs[i]->flags & IR3_REG_FIRST_KILL))
-         remove_src(ctx, instr, instr->srcs[i]);
+   /* We have to remove sources before rewriting them so that we can look up
+    * the interval to remove before the source itself is changed.
+    */
+   ra_foreach_src (src, instr) {
+      if (src->flags & IR3_REG_FIRST_KILL)
+         remove_src(ctx, instr, src);
    }
+
+   if (ctx->spilling) {
+      ra_foreach_src (src, instr) {
+         rewrite_src(ctx, instr, src);
+      }
+   }
+
+   ra_foreach_dst (dst, instr) {
+      finish_dst(ctx, dst);
+   }
+
    for (unsigned i = 0; i < instr->dsts_count; i++) {
       if (ra_reg_is_dst(instr->dsts[i]) &&
          (instr->dsts[i]->flags & IR3_REG_UNUSED))
@@ -283,28 +1019,672 @@
          remove_dst(ctx, instr->dsts[i]);
    }
 }
 
+static struct ra_spill_interval *
+create_temp_interval(struct ra_spill_ctx *ctx, struct ir3_register *def)
+{
+   unsigned name = ctx->intervals_count++;
+   unsigned offset = ctx->live->interval_offset;
+
+   /* This is kinda hacky, but we need to create a fake SSA def here that is
+    * only used as part of the pcopy accounting. See below.
+    */
+   struct ir3_register *reg = rzalloc(ctx, struct ir3_register);
+   *reg = *def;
+   reg->name = name;
+   reg->interval_start = offset;
+   reg->interval_end = offset + reg_size(def);
+   reg->merge_set = NULL;
+
+   ctx->intervals = reralloc(ctx, ctx->intervals, struct ra_spill_interval *,
+                             ctx->intervals_count);
+   struct ra_spill_interval *interval = rzalloc(ctx, struct ra_spill_interval);
+   ra_spill_interval_init(interval, reg);
+   ctx->intervals[name] = interval;
+   ctx->live->interval_offset += reg_size(def);
+   return interval;
+}
+
+/* In the sequence of copies generated (see below), would this source be
+ * killed?
+ */
+static bool
+is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n)
+{
+   struct ir3_register *src = pcopy->srcs[src_n];
+   if (!(src->flags & IR3_REG_KILL))
+      return false;
+   for (unsigned j = src_n + 1; j < pcopy->srcs_count; j++) {
+      if (pcopy->srcs[j]->def == src->def)
+         return false;
+   }
+   return true;
+}
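An outline of the per-instruction walk that handle_instr() performs in
spilling mode, paraphrased from the code above:

   /* 1. insert_src(each src)    pin sources; spilled ones marked needs_reload
    * 2. insert_dst(tied dsts)   a live-through tied source needs fresh space
    * 3. limit()                 evict values until the sources fit
    * 4. reload_src() + update_src_next_use() on every source
    * 5. remove_src_early(FIRST_KILL) so dying sources free room for dsts
    * 6. insert_dst(each dst), then limit() again so the dsts fit
    * 7. remove_src(FIRST_KILL), rewrite_src(), finish_dst(),
    *    remove_dst(UNUSED)
    */

In non-spilling mode steps 3 and 6 degrade to update_max_pressure(), which is
how the same walk doubles as ir3_calc_pressure() at the end of the file.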
+/* Parallel copies are different from normal instructions. The sources
+ * together may be larger than the entire register file, so we cannot just
+ * reload every source like normal, and indeed that probably wouldn't be a
+ * great idea. Instead we essentially need to lower the parallel copy to
+ * "copies," just like in the normal CSSA construction, although we implement
+ * the copies by reloading and then possibly spilling values. We essentially
+ * just shuffle around the sources until each source either (a) is live or
+ * (b) has the same spill slot as its corresponding destination. We do this
+ * by decomposing the copy into a series of copies, so:
+ *
+ * a, b, c = d, e, f
+ *
+ * becomes:
+ *
+ * d' = d
+ * e' = e
+ * f' = f
+ * a = d'
+ * b = e'
+ * c = f'
+ *
+ * The temporary SSA values d', e', and f' never actually show up in the
+ * result. They are only used for our internal accounting. They may, however,
+ * have their own spill slot created for them. Similarly, we don't actually
+ * emit any copy instructions, although we emit the spills/reloads that
+ * *would've* been required if those copies were there.
+ *
+ * TODO: in order to reduce the number of temporaries and therefore spill
+ * slots, we could instead do a more complicated analysis that considers the
+ * location transfer graph.
+ *
+ * In addition, we actually remove the parallel copy and rewrite all its uses
+ * (in the phi nodes) rather than rewrite its sources at the end. Recreating
+ * it later turns out to be easier than keeping it up-to-date throughout this
+ * pass, since we may have to remove entries for phi sources that are spilled
+ * and add entries for live-outs that are spilled and reloaded, which can
+ * happen here and then possibly be undone or done again when processing
+ * live-ins of the successor block.
+ */
+
+static void
+handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
+{
+   foreach_dst (dst, pcopy) {
+      struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
+      ra_spill_interval_init(dst_interval, dst);
+   }
+
+   foreach_src_n (src, i, pcopy) {
+      d("processing src %u", i);
+      struct ir3_register *dst = pcopy->dsts[i];
+
+      /* Skip the intermediate copy for cases where the source is merged with
+       * the destination. Crucially this means that we also don't reload/spill
+       * it if it's been spilled, because it shares the same spill slot.
+       */
+      if (src->def && src->def->merge_set &&
+          src->def->merge_set == dst->merge_set &&
+          src->def->merge_set_offset == dst->merge_set_offset) {
+         struct ra_spill_interval *src_interval = ctx->intervals[src->def->name];
+         struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
+         if (src_interval->interval.inserted) {
+            update_src_next_use(ctx, src);
+            if (is_last_pcopy_src(pcopy, i))
+               ra_spill_ctx_remove(ctx, src_interval);
+            dst_interval->cant_spill = true;
+            ra_spill_ctx_insert(ctx, dst_interval);
+            limit(ctx, pcopy);
+            dst_interval->cant_spill = false;
+            dst_interval->dst = src_interval->dst;
+         }
+      } else if (src->def) {
+         struct ra_spill_interval *temp_interval =
+            create_temp_interval(ctx, dst);
+         struct ir3_register *temp = temp_interval->interval.reg;
+         temp_interval->next_use_distance = src->next_use;
+
+         insert_src(ctx, src);
+         limit(ctx, pcopy);
+         reload_src(ctx, pcopy, src);
+         update_src_next_use(ctx, src);
+         if (is_last_pcopy_src(pcopy, i))
+            remove_src(ctx, pcopy, src);
+         struct ra_spill_interval *src_interval =
+            ctx->intervals[src->def->name];
+         temp_interval->dst = src_interval->dst;
+
+         temp_interval->cant_spill = true;
+         ra_spill_ctx_insert(ctx, temp_interval);
+         limit(ctx, pcopy);
+         temp_interval->cant_spill = false;
+
+         src->flags = temp->flags;
+         src->def = temp;
+      }
+   }
+
+   d("done with pcopy srcs");
+
+   foreach_src_n (src, i, pcopy) {
+      struct ir3_register *dst = pcopy->dsts[i];
+
+      if (src->def && src->def->merge_set &&
+          src->def->merge_set == dst->merge_set &&
+          src->def->merge_set_offset == dst->merge_set_offset)
+         continue;
+
+      struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
+
+      if (!src->def) {
+         dst_interval->cant_spill = true;
+         ra_spill_ctx_insert(ctx, dst_interval);
+         limit(ctx, pcopy);
+         dst_interval->cant_spill = false;
+
+         assert(src->flags & (IR3_REG_CONST | IR3_REG_IMMED));
+         if (src->flags & IR3_REG_CONST) {
+            dst_interval->dst.flags = src->flags;
+            dst_interval->dst.const_num = src->num;
+         } else {
+            dst_interval->dst.flags = src->flags;
+            dst_interval->dst.uimm = src->uim_val;
+         }
+      } else {
+         struct ra_spill_interval *temp_interval = ctx->intervals[src->def->name];
+
+         insert_src(ctx, src);
+         limit(ctx, pcopy);
+         reload_src(ctx, pcopy, src);
+         remove_src(ctx, pcopy, src);
+
+         dst_interval->dst = temp_interval->dst;
+         ra_spill_ctx_insert(ctx, dst_interval);
+      }
+   }
+
+   pcopy->flags |= IR3_INSTR_UNUSED;
+}
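Concretely, for a copy like "x, y = b, a" where neither source is merged with
its destination, the first loop above decomposes it, in accounting terms
only, into

   t0 = b
   t1 = a
   x  = t0
   y  = t1

with limit() run between every step so a just-reloaded source can be evicted
again immediately if it doesn't fit. And for "x, y = v, v" where v dies at
the pcopy, is_last_pcopy_src() fires only for the second source, so v's
interval is removed from the live set exactly once.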
 static void
 handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
 {
    init_dst(ctx, instr->dsts[0]);
    insert_dst(ctx, instr->dsts[0]);
+   finish_dst(ctx, instr->dsts[0]);
 }
 
 static void
 remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
 {
-   ra_foreach_src (src, instr)
-      remove_src(ctx, instr, src);
+   if (instr->opc == OPC_META_TEX_PREFETCH) {
+      ra_foreach_src (src, instr)
+         remove_src(ctx, instr, src);
+   }
    if (instr->dsts[0]->flags & IR3_REG_UNUSED)
       remove_dst(ctx, instr->dsts[0]);
 }
 
 static void
-handle_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def)
+handle_live_in(struct ra_spill_ctx *ctx, struct ir3_block *block,
+               struct ir3_register *def)
 {
-   struct ra_spill_interval *interval = &ctx->intervals[def->name];
+   struct ra_spill_interval *interval = ctx->intervals[def->name];
 
    ra_spill_interval_init(interval, def);
-   insert_dst(ctx, def);
+   if (ctx->spilling) {
+      interval->next_use_distance =
+         ctx->blocks[block->index].next_use_start[def->name];
+   }
+
+   ra_spill_ctx_insert(ctx, interval);
+}
+
+static bool
+is_live_in_phi(struct ir3_register *def, struct ir3_block *block)
+{
+   return def->instr->opc == OPC_META_PHI && def->instr->block == block;
+}
+
+static bool
+is_live_in_pred(struct ra_spill_ctx *ctx, struct ir3_register *def,
+                struct ir3_block *block, unsigned pred_idx)
+{
+   struct ir3_block *pred = block->predecessors[pred_idx];
+   struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+   if (is_live_in_phi(def, block)) {
+      def = def->instr->srcs[pred_idx]->def;
+      if (!def)
+         return false;
+   }
+
+   return _mesa_hash_table_search(state->remap, def);
+}
+
+static bool
+is_live_in_undef(struct ir3_register *def,
+                 struct ir3_block *block, unsigned pred_idx)
+{
+   if (!is_live_in_phi(def, block))
+      return false;
+
+   return !def->instr->srcs[pred_idx]->def;
+}
+
+static struct reg_or_immed *
+read_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
+             struct ir3_block *block, unsigned pred_idx)
+{
+   struct ir3_block *pred = block->predecessors[pred_idx];
+   struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+
+   if (is_live_in_phi(def, block)) {
+      def = def->instr->srcs[pred_idx]->def;
+      if (!def)
+         return NULL;
+   }
+
+   struct hash_entry *entry = _mesa_hash_table_search(state->remap, def);
+   if (entry)
+      return entry->data;
+   else
+      return NULL;
+}
+
+static bool
+is_live_in_all_preds(struct ra_spill_ctx *ctx, struct ir3_register *def,
+                     struct ir3_block *block)
+{
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      if (!is_live_in_pred(ctx, def, block, i))
+         return false;
+   }
+
+   return true;
+}
+
+static void
+spill_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
+              struct ir3_block *block)
+{
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+
+      if (!state->visited)
+         continue;
+
+      struct reg_or_immed *pred_def = read_live_in(ctx, def, block, i);
+      if (pred_def) {
+         spill(ctx, pred_def, get_spill_slot(ctx, def), NULL, pred);
+      }
+   }
+}
+
+static void
+spill_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   bool all_preds_visited = true;
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+      if (!state->visited) {
+         all_preds_visited = false;
+         break;
+      }
+   }
+
+   /* Note: in the paper they explicitly spill live-through values first, but
+    * we should be doing that automatically by virtue of picking the largest
+    * distance due to the extra distance added to edges out of loops.
+    *
+    * TODO: Keep track of pressure in each block and preemptively spill
+    * live-through values as described in the paper to avoid spilling them
+    * inside the loop.
+    */
+
+   if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
+      rb_tree_foreach_safe (struct ra_spill_interval, interval,
+                            &ctx->half_live_intervals, half_node) {
+         if (all_preds_visited &&
+             is_live_in_all_preds(ctx, interval->interval.reg, block))
+            continue;
+         spill_live_in(ctx, interval->interval.reg, block);
+         ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
+         if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
+            break;
+      }
+   }
+
+   if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
+      rb_tree_foreach_safe (struct ra_spill_interval, interval,
+                            &ctx->full_live_intervals, node) {
+         if (all_preds_visited &&
+             is_live_in_all_preds(ctx, interval->interval.reg, block))
+            continue;
+         spill_live_in(ctx, interval->interval.reg, block);
+         ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
+         if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
+            break;
+      }
+   }
+}
+
+static void
+live_in_rewrite(struct ra_spill_ctx *ctx,
+                struct ra_spill_interval *interval,
+                struct reg_or_immed *new_val,
+                struct ir3_block *block, unsigned pred_idx)
+{
+   struct ir3_block *pred = block->predecessors[pred_idx];
+   struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+   struct ir3_register *def = interval->interval.reg;
+   if (is_live_in_phi(def, block)) {
+      def = def->instr->srcs[pred_idx]->def;
+   }
+
+   if (def)
+      _mesa_hash_table_insert(state->remap, def, new_val);
+
+   rb_tree_foreach (struct ra_spill_interval, child,
+                    &interval->interval.children, interval.node) {
+      assert(new_val->flags & IR3_REG_SSA);
+      struct ir3_register *child_def =
+         extract(new_val->def,
+                 (child->interval.reg->interval_start - def->interval_start) /
+                 reg_elem_size(def), reg_elems(child->interval.reg),
+                 NULL, pred);
+      struct reg_or_immed *child_val = ralloc(ctx, struct reg_or_immed);
+      child_val->def = child_def;
+      child_val->flags = child_def->flags;
+      live_in_rewrite(ctx, child, child_val, block, pred_idx);
+   }
+}
+
+static void
+reload_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
+               struct ir3_block *block)
+{
+   struct ra_spill_interval *interval = ctx->intervals[def->name];
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+      if (!state->visited)
+         continue;
+
+      if (is_live_in_undef(def, block, i))
+         continue;
+
+      struct reg_or_immed *new_val = read_live_in(ctx, def, block, i);
+
+      if (!new_val) {
+         new_val = ralloc(ctx, struct reg_or_immed);
+         new_val->def = reload(ctx, def, NULL, pred);
+         new_val->flags = new_val->def->flags;
+      }
+      live_in_rewrite(ctx, interval, new_val, block, i);
+   }
+}
+
+static void
+reload_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
+                    interval.node) {
+      reload_live_in(ctx, interval->interval.reg, block);
+   }
+}
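The glue between blocks is each visited predecessor's state->remap hash
table, which maps that block's defs to wherever the value lives at the end of
the block: a register, a const, an immediate, or nothing at all, meaning the
value only exists in the spill area. The helpers above all reduce to this
lookup (illustrative wrapper, not part of the patch):

   /* NULL means the value is only in scratch memory on this edge, and
    * reload_live_in() must emit a reload in the predecessor. */
   static struct reg_or_immed *
   lookup_live_in(struct hash_table *pred_remap, struct ir3_register *def)
   {
      struct hash_entry *entry = _mesa_hash_table_search(pred_remap, def);
      return entry ? (struct reg_or_immed *)entry->data : NULL;
   }

For a phi live-in the def is first translated to the matching phi source of
that predecessor before the lookup, which is what is_live_in_phi() is for.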
+static void
+add_live_in_phi(struct ra_spill_ctx *ctx, struct ir3_register *def,
+                struct ir3_block *block)
+{
+   struct ra_spill_interval *interval = ctx->intervals[def->name];
+   if (!interval->interval.inserted)
+      return;
+
+   bool needs_phi = false;
+   struct ir3_register *cur_def = NULL;
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+
+      if (!state->visited) {
+         needs_phi = true;
+         break;
+      }
+
+      struct hash_entry *entry =
+         _mesa_hash_table_search(state->remap, def);
+      assert(entry);
+      struct reg_or_immed *pred_val = entry->data;
+      if ((pred_val->flags & (IR3_REG_IMMED | IR3_REG_CONST)) ||
+          !pred_val->def ||
+          (cur_def && cur_def != pred_val->def)) {
+         needs_phi = true;
+         break;
+      }
+      cur_def = pred_val->def;
+   }
+
+   if (!needs_phi) {
+      interval->dst.def = cur_def;
+      interval->dst.flags = cur_def->flags;
+      return;
+   }
+
+   struct ir3_instruction *phi =
+      ir3_instr_create(block, OPC_META_PHI, 1, block->predecessors_count);
+   struct ir3_register *dst = __ssa_dst(phi);
+   dst->flags |= def->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
+   dst->size = def->size;
+   dst->wrmask = def->wrmask;
+
+   dst->interval_start = def->interval_start;
+   dst->interval_end = def->interval_end;
+   dst->merge_set = def->merge_set;
+   dst->merge_set_offset = def->merge_set_offset;
+
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+      struct ir3_register *src = ir3_src_create(phi, INVALID_REG, dst->flags);
+      src->size = def->size;
+      src->wrmask = def->wrmask;
+
+      if (state->visited) {
+         struct hash_entry *entry =
+            _mesa_hash_table_search(state->remap, def);
+         assert(entry);
+         struct reg_or_immed *new_val = entry->data;
+         set_src_val(src, new_val);
+      } else {
+         src->def = def;
+      }
+   }
+
+   interval->dst.def = dst;
+   interval->dst.flags = dst->flags;
+
+   ir3_instr_move_before_block(phi, block);
+}
+
+/* When spilling a block with a single predecessor, the pred may have other
+ * successors so we can't choose what's live in and we can't spill/restore
+ * anything. Just make the inserted intervals exactly match the predecessor.
+ * If it wasn't live in the predecessor then it must've already been spilled.
+ * Also, there are no phi nodes and no live-ins.
+ */
+static void
+spill_single_pred_live_in(struct ra_spill_ctx *ctx,
+                          struct ir3_block *block)
+{
+   unsigned name;
+   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
+                       ctx->live->definitions_count) {
+      struct ir3_register *reg = ctx->live->definitions[name];
+      struct ra_spill_interval *interval = ctx->intervals[reg->name];
+      struct reg_or_immed *val = read_live_in(ctx, reg, block, 0);
+      if (val)
+         interval->dst = *val;
+      else
+         ra_spill_ctx_remove(ctx, interval);
+   }
+}
+
+static void
+rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi,
+            struct ir3_block *block)
+{
+   if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) {
+      phi->flags |= IR3_INSTR_UNUSED;
+      return;
+   }
+
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+
+      if (!state->visited)
+         continue;
+
+      struct ir3_register *src = phi->srcs[i];
+      if (!src->def)
+         continue;
+
+      struct hash_entry *entry =
+         _mesa_hash_table_search(state->remap, src->def);
+      assert(entry);
+      struct reg_or_immed *new_val = entry->data;
+      set_src_val(src, new_val);
+   }
+}
+
+static void
+spill_live_out(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
+               struct ir3_block *block)
+{
+   struct ir3_register *def = interval->interval.reg;
+
+   spill(ctx, &interval->dst, get_spill_slot(ctx, def), NULL, block);
+   ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
+}
+
+static void
+spill_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   struct ra_spill_block_state *state = &ctx->blocks[block->index];
+   rb_tree_foreach_safe (struct ra_spill_interval, interval,
+                         &ctx->reg_ctx.intervals, interval.node) {
+      if (!BITSET_TEST(state->live_out, interval->interval.reg->name)) {
+         spill_live_out(ctx, interval, block);
+      }
+   }
+}
+
+static void
+reload_live_out(struct ra_spill_ctx *ctx, struct ir3_register *def,
+                struct ir3_block *block)
+{
+   struct ra_spill_interval *interval = ctx->intervals[def->name];
+   ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
+
+   reload_def(ctx, def, NULL, block);
+}
+
+static void
+reload_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   struct ra_spill_block_state *state = &ctx->blocks[block->index];
+   unsigned name;
+   BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
+      struct ir3_register *reg = ctx->live->definitions[name];
+      struct ra_spill_interval *interval = ctx->intervals[name];
+      if (!interval->interval.inserted)
+         reload_live_out(ctx, reg, block);
+   }
+}
+
+static void
+update_live_out_phis(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   assert(!block->successors[1]);
+   struct ir3_block *succ = block->successors[0];
+   unsigned pred_idx = ir3_block_get_pred_index(succ, block);
+
+   foreach_instr (instr, &succ->instr_list) {
+      if (instr->opc != OPC_META_PHI)
+         break;
+
+      struct ir3_register *def = instr->srcs[pred_idx]->def;
+      if (!def)
+         continue;
+
+      struct ra_spill_interval *interval = ctx->intervals[def->name];
+      if (!interval->interval.inserted)
+         continue;
+      set_src_val(instr->srcs[pred_idx], &interval->dst);
+   }
+}
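These three live-out helpers exist for the loop-backedge case in
handle_block() further down: when a block's successor has already been
visited, the successor's register state is fixed, so the current block has to
conform to it rather than the other way around -- spill whatever the
successor doesn't expect to be live, reload whatever it does, and point the
successor's phi sources at the values' current homes. This mirrors the
live-in stitching above, just with the direction of authority reversed.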
+static void
+record_pred_live_out(struct ra_spill_ctx *ctx,
+                     struct ra_spill_interval *interval,
+                     struct ir3_block *block, unsigned pred_idx)
+{
+   struct ir3_block *pred = block->predecessors[pred_idx];
+   struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+
+   struct ir3_register *def = interval->interval.reg;
+   if (is_live_in_phi(def, block)) {
+      def = def->instr->srcs[pred_idx]->def;
+   }
+   BITSET_SET(state->live_out, def->name);
+
+   rb_tree_foreach (struct ra_spill_interval, child,
+                    &interval->interval.children, interval.node) {
+      record_pred_live_out(ctx, child, block, pred_idx);
+   }
+}
+
+static void
+record_pred_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+      if (state->visited)
+         continue;
+
+      state->live_out = rzalloc_array(ctx, BITSET_WORD,
+                                      BITSET_WORDS(ctx->live->definitions_count));
+
+      rb_tree_foreach (struct ra_spill_interval, interval,
+                       &ctx->reg_ctx.intervals, interval.node) {
+         record_pred_live_out(ctx, interval, block, i);
+      }
+   }
+}
+
+static void
+record_live_out(struct ra_spill_ctx *ctx,
+                struct ra_spill_block_state *state,
+                struct ra_spill_interval *interval)
+{
+   if (!(interval->dst.flags & IR3_REG_SSA) ||
+       interval->dst.def) {
+      struct reg_or_immed *val = ralloc(ctx, struct reg_or_immed);
+      *val = interval->dst;
+      _mesa_hash_table_insert(state->remap, interval->interval.reg, val);
+   }
+   rb_tree_foreach (struct ra_spill_interval, child,
+                    &interval->interval.children, interval.node) {
+      record_live_out(ctx, state, child);
+   }
+}
+
+static void
+record_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
+{
+   struct ra_spill_block_state *state = &ctx->blocks[block->index];
+   state->remap = _mesa_pointer_hash_table_create(ctx);
+
+   rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
+                    interval.node) {
+      record_live_out(ctx, state, interval);
+   }
+}
+
 static void
@@ -312,12 +1692,14 @@ handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
 {
    memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
    rb_tree_init(&ctx->reg_ctx.intervals);
+   rb_tree_init(&ctx->full_live_intervals);
+   rb_tree_init(&ctx->half_live_intervals);
 
    unsigned name;
    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
                        ctx->live->definitions_count) {
       struct ir3_register *reg = ctx->live->definitions[name];
-      handle_live_in(ctx, reg);
+      handle_live_in(ctx, block, reg);
    }
 
    foreach_instr (instr, &block->instr_list) {
@@ -327,36 +1709,297 @@ handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
       handle_input_phi(ctx, instr);
    }
 
-   update_max_pressure(ctx);
+   if (ctx->spilling) {
+      if (block->predecessors_count == 1) {
+         spill_single_pred_live_in(ctx, block);
+      } else {
+         spill_live_ins(ctx, block);
+         reload_live_ins(ctx, block);
+         record_pred_live_outs(ctx, block);
+         foreach_instr (instr, &block->instr_list) {
+            if (instr->opc != OPC_META_PHI)
+               break;
+            rewrite_phi(ctx, instr, block);
+         }
+         BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
+                             ctx->live->definitions_count) {
+            struct ir3_register *reg = ctx->live->definitions[name];
+            add_live_in_phi(ctx, reg, block);
+         }
+      }
+   } else {
+      update_max_pressure(ctx);
+   }
 
    foreach_instr (instr, &block->instr_list) {
+      di(instr, "processing");
+
       if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
           instr->opc == OPC_META_TEX_PREFETCH)
          remove_input_phi(ctx, instr);
+      else if (ctx->spilling && instr->opc == OPC_META_PARALLEL_COPY)
+         handle_pcopy(ctx, instr);
+      else if (ctx->spilling && instr->opc == OPC_MOV &&
+               instr->dsts[0] == ctx->base_reg)
+         /* skip */;
       else
          handle_instr(ctx, instr);
    }
+
+   if (ctx->spilling && block->successors[0]) {
+      struct ra_spill_block_state *state =
+         &ctx->blocks[block->successors[0]->index];
+      if (state->visited) {
+         assert(!block->successors[1]);
+
+         spill_live_outs(ctx, block);
+         reload_live_outs(ctx, block);
+         update_live_out_phis(ctx, block);
+      }
+   }
+
+   if (ctx->spilling) {
+      record_live_outs(ctx, block);
+      ctx->blocks[block->index].visited = true;
+   }
+}
+
+static bool
+simplify_phi_node(struct ir3_instruction *phi)
+{
+   struct ir3_register *def = NULL;
+   foreach_src (src, phi) {
+      /* Ignore phi sources which point to the phi itself. */
+      if (src->def == phi->dsts[0])
+         continue;
+      /* If it's undef or it doesn't match the previous sources, bail */
+      if (!src->def || (def && def != src->def))
+         return false;
+      def = src->def;
+   }
+
+   phi->data = def;
+   phi->flags |= IR3_INSTR_UNUSED;
+   return true;
+}
+
+static void
+simplify_phi_srcs(struct ir3_instruction *instr)
+{
+   foreach_src (src, instr) {
+      if (src->def && src->def->instr->opc == OPC_META_PHI) {
+         struct ir3_instruction *phi = src->def->instr;
+         if (phi->data)
+            src->def = phi->data;
+      }
+   }
+}
+
+/* We insert phi nodes for all live-ins of loops in case we need to split the
+ * live range. This pass cleans that up for the case where the live range
+ * didn't actually need to be split.
+ */
+static void
+simplify_phi_nodes(struct ir3 *ir)
+{
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (instr->opc != OPC_META_PHI)
+            break;
+         instr->data = NULL;
+      }
+   }
+
+   bool progress;
+   do {
+      progress = false;
+      foreach_block (block, &ir->block_list) {
+         foreach_instr (instr, &block->instr_list) {
+            if (instr->opc == OPC_META_PHI || (instr->flags & IR3_INSTR_UNUSED))
+               continue;
+
+            simplify_phi_srcs(instr);
+         }
+
+         for (unsigned i = 0; i < 2; i++) {
+            struct ir3_block *succ = block->successors[i];
+            if (!succ)
+               continue;
+            foreach_instr (instr, &succ->instr_list) {
+               if (instr->opc != OPC_META_PHI)
+                  break;
+               if (instr->flags & IR3_INSTR_UNUSED)
+                  continue;
+
+               simplify_phi_srcs(instr);
+               progress |= simplify_phi_node(instr);
+            }
+         }
+      }
+   } while (progress);
+}
+
+static void
+unmark_dead(struct ir3 *ir)
+{
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         instr->flags &= ~IR3_INSTR_UNUSED;
+      }
+   }
+}
+
+/* Simple pass to remove now-dead phi nodes and pcopy instructions. We mark
+ * which ones are dead along the way, so there's nothing to compute here.
+ */
+static void
+cleanup_dead(struct ir3 *ir)
+{
+   foreach_block (block, &ir->block_list) {
+      foreach_instr_safe (instr, &block->instr_list) {
+         if (instr->flags & IR3_INSTR_UNUSED)
+            list_delinit(&instr->node);
+      }
+   }
+}
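A typical victim of simplify_phi_nodes() is a loop live-in whose range never
actually got split, e.g. (illustrative SSA):

   header:
      x1 = phi(x0, x1)   ; the backedge source is the phi itself

simplify_phi_node() skips the self-reference, sees that every remaining
source is x0, stashes x0 in phi->data, and marks the phi IR3_INSTR_UNUSED;
simplify_phi_srcs() then redirects users of x1 back to x0. The pass iterates
to a fixed point so chains of such phis collapse as well, and cleanup_dead()
finally deletes everything marked unused.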
+/* Deal with merge sets after spilling. Spilling generally leaves the merge
+ * sets in a mess, and even if we properly cleaned up after ourselves, we
+ * would want to recompute the merge sets afterward anyway. That's because
+ * spilling/reloading can "break up" phi webs and split/collect webs so that
+ * allocating them to the same register no longer gives any benefit. For
+ * example, imagine we have this:
+ *
+ * if (...) {
+ *    foo = ...
+ * } else {
+ *    bar = ...
+ * }
+ * baz = phi(foo, bar)
+ *
+ * and we spill "baz":
+ *
+ * if (...) {
+ *    foo = ...
+ *    spill(foo)
+ * } else {
+ *    bar = ...
+ *    spill(bar)
+ * }
+ * baz = reload()
+ *
+ * now foo, bar, and baz don't have to be allocated to the same register. How
+ * exactly the merge sets change can be complicated, so it's easier just to
+ * recompute them.
+ *
+ * However, there's a wrinkle in this: those same merge sets determine the
+ * register pressure, due to multiple values inhabiting the same register!
+ * And we assume that this sharing happens when spilling. Therefore we need a
+ * three-step procedure:
+ *
+ * 1. Drop the original merge sets.
+ * 2. Calculate which values *must* be merged, being careful to only use the
+ *    interval information which isn't trashed by spilling, and forcibly
+ *    merge them.
+ * 3. Let ir3_merge_regs() finish the job, including recalculating the
+ *    intervals.
+ */
+
+static void
+fixup_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
+{
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         ra_foreach_dst (dst, instr) {
+            dst->merge_set = NULL;
+            dst->merge_set_offset = 0;
+         }
+      }
+   }
+
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (instr->opc != OPC_META_SPLIT &&
+             instr->opc != OPC_META_COLLECT)
+            continue;
+
+         struct ir3_register *dst = instr->dsts[0];
+         ra_foreach_src (src, instr) {
+            if (!(src->flags & IR3_REG_KILL) &&
+                src->def->interval_start < dst->interval_end &&
+                dst->interval_start < src->def->interval_end) {
+               ir3_force_merge(dst, src->def,
+                               src->def->interval_start - dst->interval_start);
+            }
+         }
+      }
+   }
+
+   ir3_merge_regs(live, ir);
+}
+
 void
 ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
                   struct ir3_pressure *max_pressure)
 {
-   struct ra_spill_ctx ctx = {};
-   ctx.live = live;
-   ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals));
-   ctx.compiler = v->shader->compiler;
-   spill_ctx_init(&ctx);
+   struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx);
+   spill_ctx_init(ctx, v, live);
 
    foreach_block (block, &v->ir->block_list) {
-      handle_block(&ctx, block);
+      handle_block(ctx, block);
    }
 
-   assert(ctx.cur_pressure.full == 0);
-   assert(ctx.cur_pressure.half == 0);
-   assert(ctx.cur_pressure.shared == 0);
+   assert(ctx->cur_pressure.full == 0);
+   assert(ctx->cur_pressure.half == 0);
+   assert(ctx->cur_pressure.shared == 0);
 
-   free(ctx.intervals);
-
-   *max_pressure = ctx.max_pressure;
+   *max_pressure = ctx->max_pressure;
+   ralloc_free(ctx);
+}
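Since the same walk serves both pressure measurement (ctx->spilling false)
and actual spilling (ctx->spilling true), a plausible caller looks like the
following sketch -- illustrative only, the real RA driver loop is not part of
this hunk, and the limits are made up:

   static void
   spill_if_needed(struct ir3 *ir, struct ir3_shader_variant *v,
                   struct ir3_liveness **live)
   {
      struct ir3_pressure max;
      /* Hypothetical limits; .shared is effectively unlimited since the
       * spiller only tracks full/half trees. */
      struct ir3_pressure limit = {
         .full = 48 * 4, .half = 48 * 4, .shared = ~0u,
      };

      ir3_calc_pressure(v, *live, &max);
      if (max.full > limit.full || max.half > limit.half)
         ir3_spill(ir, v, live, &limit);
   }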
+bool
+ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v,
+          struct ir3_liveness **live,
+          const struct ir3_pressure *limit_pressure)
+{
+   struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx);
+   spill_ctx_init(ctx, v, *live);
+
+   ctx->spilling = true;
+
+   ctx->blocks = rzalloc_array(ctx, struct ra_spill_block_state,
+                               ctx->live->block_count);
+   rb_tree_init(&ctx->full_live_intervals);
+   rb_tree_init(&ctx->half_live_intervals);
+
+   ctx->limit_pressure = *limit_pressure;
+   ctx->spill_slot = v->pvtmem_size;
+
+   add_base_reg(ctx, ir);
+   compute_next_distance(ctx, ir);
+
+   unmark_dead(ir);
+
+   foreach_block (block, &ir->block_list) {
+      handle_block(ctx, block);
+   }
+
+   simplify_phi_nodes(ir);
+
+   cleanup_dead(ir);
+
+   ir3_create_parallel_copies(ir);
+
+   /* After this point, we're done mutating the IR. Liveness has been trashed,
+    * so recalculate it. We'll need it for recalculating the merge sets.
+    */
+   ralloc_free(ctx->live);
+   *live = ir3_calc_liveness(v);
+
+   fixup_merge_sets(*live, ir);
+
+   v->pvtmem_size = ctx->spill_slot;
+   ralloc_free(ctx);
+
+   return true;
 }
diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c
index 4fe56b45c9c..08f2df4251a 100644
--- a/src/freedreno/ir3/ir3_validate.c
+++ b/src/freedreno/ir3/ir3_validate.c
@@ -187,7 +187,7 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
          /* end/chmask/etc are allowed to have different size sources */
       } else if (instr->opc == OPC_META_PARALLEL_COPY) {
          /* pcopy sources have to match with their destination but can have
-          * different size.
+          * different sizes from each other.
           */
       } else if (n > 0) {
          validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) ==
@@ -303,6 +303,7 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
    case OPC_STL:
    case OPC_STP:
    case OPC_STLW:
+   case OPC_SPILL_MACRO:
       validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF));
       validate_reg_size(ctx, instr->srcs[1], instr->cat6.type);
       validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF));
diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build
index 40bdb26194d..0456bc59253 100644
--- a/src/freedreno/ir3/meson.build
+++ b/src/freedreno/ir3/meson.build
@@ -88,6 +88,7 @@ libfreedreno_ir3_files = files(
   'ir3_legalize.c',
   'ir3_liveness.c',
   'ir3_lower_parallelcopy.c',
+  'ir3_lower_spill.c',
   'ir3_lower_subgroups.c',
   'ir3_merge_regs.c',
   'ir3_nir.c',
diff --git a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt
index 61c1a5b0cda..d311f70ae3b 100644
--- a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt
+++ b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt
@@ -345,7 +345,6 @@ spec@glsl-1.50@execution@compatibility@vs-gs-ff-frag,Crash
 spec@glsl-1.50@execution@compatibility@vs-gs-texcoord-array-2,Crash
 spec@glsl-1.50@execution@compatibility@vs-gs-texcoord-array,Crash
 spec@glsl-1.50@execution@geometry@end-primitive 0,Fail
-spec@glsl-1.50@execution@geometry@max-input-components,Fail
 spec@glsl-1.50@execution@geometry@primitive-id-restart gl_line_loop ffs,Fail
 spec@glsl-1.50@execution@geometry@primitive-id-restart gl_line_loop other,Fail
 spec@glsl-1.50@execution@geometry@primitive-id-restart gl_lines_adjacency ffs,Fail
@@ -385,11 +384,7 @@ spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triang
 spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triangle_strip other,Fail
 spec@glsl-1.50@execution@primitive-id-no-gs-quads,Fail
 spec@glsl-1.50@execution@primitive-id-no-gs-quad-strip,Fail
-spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec2-index-rd,Fail
-spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec3-index-rd,Fail
-spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec3-index-wr,Fail
 spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec4-index-wr,Crash
-spec@glsl-1.50@execution@variable-indexing@vs-output-array-vec4-index-wr-before-gs,Fail
 spec@glsl-1.50@gs-max-output-components,Fail
 spec@intel_performance_query@intel_performance_query-issue_2235,Fail
 spec@khr_texture_compression_astc@array-gl@12x12 Block Dim,Fail