From 941739619e1f7841aee8048e6a046377b39b19b0 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 1 Mar 2021 10:43:01 +0000 Subject: [PATCH] Revert "radv,aco: allow unaligned LDS access on GFX9+" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 1a0b0e8460c1881f94975b3dfbe9c312d9c3fcf7. The bounds checking behaviour of ds_read_b64, ds_read_b96 and ds_read_b128 make this feature very difficult to use safely. This fixes a blocking artifact in Hitman 2. Previously, it contained: ds_read_b64(local_invocation_index() * 4 - 4) For local_invocation_index()=0, the second dword would be considered out-of-bounds, even though it's at offset 0. Reviewed-by: Daniel Schürmann Part-of: --- .../compiler/aco_instruction_selection.cpp | 24 ++++++++----------- src/amd/compiler/aco_ir.cpp | 4 ---- src/amd/compiler/aco_ir.h | 1 - src/amd/vulkan/radv_pipeline.c | 8 ++----- 4 files changed, 12 insertions(+), 25 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 2fcfdaa2b11..3e49c88ea64 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3466,35 +3466,31 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo &info, bool large_ds_read = bld.program->chip_class >= GFX7; bool usable_read2 = bld.program->chip_class >= GFX7; - bool aligned2 = align % 2 == 0; - bool aligned4 = align % 4 == 0; - bool aligned8 = bld.program->dev.has_unaligned_lds_access ? aligned4 : (align % 8 == 0); - bool aligned16 = bld.program->dev.has_unaligned_lds_access ? aligned4 : (align % 16 == 0); - bool read2 = false; unsigned size = 0; aco_opcode op; - if (bytes_needed >= 16 && aligned16 && large_ds_read) { + //TODO: use ds_read_u8_d16_hi/ds_read_u16_d16_hi if beneficial + if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) { size = 16; op = aco_opcode::ds_read_b128; - } else if (bytes_needed >= 16 && aligned8 && const_offset % 8 == 0 && usable_read2) { + } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) { size = 16; read2 = true; op = aco_opcode::ds_read2_b64; - } else if (bytes_needed >= 12 && aligned16 && large_ds_read) { + } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) { size = 12; op = aco_opcode::ds_read_b96; - } else if (bytes_needed >= 8 && aligned8) { + } else if (bytes_needed >= 8 && align % 8 == 0) { size = 8; op = aco_opcode::ds_read_b64; - } else if (bytes_needed >= 8 && aligned4 && const_offset % 4 == 0) { + } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0) { size = 8; read2 = true; op = aco_opcode::ds_read2_b32; - } else if (bytes_needed >= 4 && aligned4) { + } else if (bytes_needed >= 4 && align % 4 == 0) { size = 4; op = aco_opcode::ds_read_b32; - } else if (bytes_needed >= 2 && aligned2) { + } else if (bytes_needed >= 2 && align % 2 == 0) { size = 2; op = aco_opcode::ds_read_u16; } else { @@ -3858,8 +3854,8 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t bool aligned2 = offset % 2 == 0 && align % 2 == 0; bool aligned4 = offset % 4 == 0 && align % 4 == 0; - bool aligned8 = bld.program->dev.has_unaligned_lds_access ? aligned4 : (offset % 8 == 0 && align % 8 == 0); - bool aligned16 = bld.program->dev.has_unaligned_lds_access ? aligned4 : (offset % 16 == 0 && align % 16 == 0); + bool aligned8 = offset % 8 == 0 && align % 8 == 0; + bool aligned16 = offset % 16 == 0 && align % 16 == 0; //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial aco_opcode op = aco_opcode::num_opcodes; diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 4e248a9bd33..3525433dfa5 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -98,10 +98,6 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768; /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */ program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY; - /* GFX10 WGP has a bug making naturally aligned access required. The LLVM - * subtarget feature is called "FeatureLdsMisalignedBug". - */ - program->dev.has_unaligned_lds_access = chip_class >= GFX9 && !(chip_class == GFX10 && wgp_mode); program->dev.vgpr_limit = 256; program->dev.physical_vgprs = 256; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index f44dacdd6c3..548e71b5397 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1799,7 +1799,6 @@ struct DeviceInfo { uint16_t lds_alloc_granule; uint32_t lds_limit; /* in bytes */ bool has_16bank_lds; - bool has_unaligned_lds_access; uint16_t physical_sgprs; uint16_t physical_vgprs; uint16_t vgpr_limit; diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 907fa4e396f..3e8bfbe9905 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -3053,9 +3053,6 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset, nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data) { - struct radv_device *device = data; - enum chip_class chip = device->physical_device->rad_info.chip_class; - if (num_components > 4) return false; @@ -3084,9 +3081,9 @@ mem_vectorize_callback(unsigned align_mul, unsigned align_offset, FALLTHROUGH; case nir_intrinsic_load_shared: case nir_intrinsic_store_shared: - if (chip < GFX9 && bit_size * num_components == 96) /* 96 bit loads require 128 bit alignment on GFX6-8 and are split otherwise */ + if (bit_size * num_components == 96) /* 96 bit loads require 128 bit alignment and are split otherwise */ return align % 16 == 0; - else if (chip < GFX9 && bit_size * num_components == 128) /* 128 bit loads require 64 bit alignment on GFX6-8 and are split otherwise */ + else if (bit_size * num_components == 128) /* 128 bit loads require 64 bit alignment and are split otherwise */ return align % 8 == 0; else return align % (bit_size == 8 ? 2 : 4) == 0; @@ -3335,7 +3332,6 @@ VkResult radv_create_shaders(struct radv_pipeline *pipeline, nir_var_mem_push_const | nir_var_mem_shared | nir_var_mem_global, .callback = mem_vectorize_callback, - .cb_data = device, .robust_modes = 0, };