diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c index 615644c1dd4..88af6084208 100644 --- a/src/amd/common/ac_nir.c +++ b/src/amd/common/ac_nir.c @@ -2344,6 +2344,228 @@ ac_nir_lower_bit_size_callback(const nir_instr *instr, void *data) return 0; } +static unsigned +align_load_store_size(enum amd_gfx_level gfx_level, unsigned size, bool uses_smem, bool is_shared) +{ + /* LDS can't overfetch because accesses that are partially out of range would be dropped + * entirely, so all unaligned LDS accesses are always split. + */ + if (is_shared) + return size; + + /* Align the size to what the hw supports. Out of range access due to alignment is OK because + * range checking is per dword for untyped instructions. This assumes that the compiler backend + * overfetches due to load size alignment instead of splitting the load. + * + * GFX6-11 don't have 96-bit SMEM loads. + * GFX6 doesn't have 96-bit untyped VMEM loads. + */ + if (gfx_level >= (uses_smem ? GFX12 : GFX7) && size == 96) + return size; + else + return util_next_power_of_two(size); +} + +bool +ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size, + unsigned num_components, int64_t hole_size, nir_intrinsic_instr *low, + nir_intrinsic_instr *high, void *data) +{ + struct ac_nir_config *config = (struct ac_nir_config *)data; + bool uses_smem = (nir_intrinsic_has_access(low) && + nir_intrinsic_access(low) & ACCESS_SMEM_AMD) || + /* These don't have the "access" field. */ + low->intrinsic == nir_intrinsic_load_smem_amd || + low->intrinsic == nir_intrinsic_load_push_constant; + bool is_store = !nir_intrinsic_infos[low->intrinsic].has_dest; + bool is_scratch = low->intrinsic == nir_intrinsic_load_stack || + low->intrinsic == nir_intrinsic_store_stack || + low->intrinsic == nir_intrinsic_load_scratch || + low->intrinsic == nir_intrinsic_store_scratch; + bool is_shared = low->intrinsic == nir_intrinsic_load_shared || + low->intrinsic == nir_intrinsic_store_shared || + low->intrinsic == nir_intrinsic_load_deref || + low->intrinsic == nir_intrinsic_store_deref; + + assert(!is_store || hole_size <= 0); + + /* If we get derefs here, only shared memory derefs are expected. */ + assert((low->intrinsic != nir_intrinsic_load_deref && + low->intrinsic != nir_intrinsic_store_deref) || + nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared)); + + /* Don't vectorize descriptor loads for LLVM due to excessive SGPR and VGPR spilling. */ + if (!config->uses_aco && low->intrinsic == nir_intrinsic_load_smem_amd) + return false; + + /* Reject opcodes we don't vectorize. */ + switch (low->intrinsic) { + case nir_intrinsic_load_smem_amd: + case nir_intrinsic_load_push_constant: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_stack: + case nir_intrinsic_store_stack: + case nir_intrinsic_load_scratch: + case nir_intrinsic_store_scratch: + case nir_intrinsic_load_global_constant: + case nir_intrinsic_load_global: + case nir_intrinsic_store_global: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_store_ssbo: + case nir_intrinsic_load_deref: + case nir_intrinsic_store_deref: + case nir_intrinsic_load_shared: + case nir_intrinsic_store_shared: + break; + default: + return false; + } + + /* Align the size to what the hw supports. 
*/ + unsigned unaligned_new_size = num_components * bit_size; + unsigned aligned_new_size = align_load_store_size(config->gfx_level, unaligned_new_size, + uses_smem, is_shared); + + if (uses_smem) { + /* Maximize SMEM vectorization except for LLVM, which suffers from SGPR and VGPR spilling. + * GFX6-7 have fewer hw SGPRs, so merge only up to 128 bits to limit SGPR usage. + */ + if (aligned_new_size > (config->gfx_level >= GFX8 ? (config->uses_aco ? 512 : 256) : 128)) + return false; + } else { + if (aligned_new_size > 128) + return false; + + /* GFX6-8 only support 32-bit scratch loads/stores. */ + if (config->gfx_level <= GFX8 && is_scratch && aligned_new_size > 32) + return false; + } + + if (!is_store) { + /* Non-descriptor loads. */ + if (low->intrinsic != nir_intrinsic_load_ubo && + low->intrinsic != nir_intrinsic_load_ssbo) { + /* Only increase the size of loads if doing so doesn't extend into a new page. + * Here we set alignment to MAX because we don't know the alignment of global + * pointers before adding the offset. + */ + uint32_t resource_align = low->intrinsic == nir_intrinsic_load_global_constant || + low->intrinsic == nir_intrinsic_load_global ? NIR_ALIGN_MUL_MAX : 4; + uint32_t page_size = 4096; + uint32_t mul = MIN3(align_mul, page_size, resource_align); + unsigned end = (align_offset + unaligned_new_size / 8u) & (mul - 1); + if ((aligned_new_size - unaligned_new_size) / 8u > (mul - end)) + return false; + } + + /* Only allow SMEM loads to overfetch by 32 bits: + * + * Examples (the hole is indicated by parentheses, the numbers are in bytes, the maximum + * overfetch size is 4): + * 4 | (4) | 4 -> hw loads 12 : ALLOWED (4 over) + * 4 | (4) | 4 -> hw loads 16 : DISALLOWED (8 over) + * 4 | 4 | 4 -> hw loads 16 : ALLOWED (4 over) + * 4 | (4) | 8 -> hw loads 16 : ALLOWED (4 over) + * 16 | 4 -> hw loads 32 : DISALLOWED (12 over) + * 16 | 8 -> hw loads 32 : DISALLOWED (8 over) + * 16 | 12 -> hw loads 32 : ALLOWED (4 over) + * 16 | (4) | 12 -> hw loads 32 : ALLOWED (4 over) + * 32 | 16 -> hw loads 64 : DISALLOWED (16 over) + * 32 | 28 -> hw loads 64 : ALLOWED (4 over) + * 32 | (4) | 28 -> hw loads 64 : ALLOWED (4 over) + * + * Note that we can overfetch by more than 4 bytes if we merge more than 2 loads, e.g.: + * 4 | (4) | 8 | (4) | 12 -> hw loads 32 : ALLOWED (4 + 4 over) + * + * That's because this callback is called twice in that case, each time allowing only 4 over. + * + * This is only enabled for ACO. LLVM spills SGPRs and VGPRs too much. + */ + unsigned overfetch_size = 0; + + if (config->uses_aco && uses_smem && aligned_new_size >= 128) + overfetch_size = 32; + + int64_t aligned_unvectorized_size = + align_load_store_size(config->gfx_level, low->num_components * low->def.bit_size, + uses_smem, is_shared) + + align_load_store_size(config->gfx_level, high->num_components * high->def.bit_size, + uses_smem, is_shared); + + if (aligned_new_size > aligned_unvectorized_size + overfetch_size) + return false; + } + + uint32_t align; + if (align_offset) + align = 1 << (ffs(align_offset) - 1); + else + align = align_mul; + + /* Validate the alignment and number of components. 
*/ + if (!is_shared) { + unsigned max_components; + if (align % 4 == 0) + max_components = NIR_MAX_VEC_COMPONENTS; + else if (align % 2 == 0) + max_components = 16u / bit_size; + else + max_components = 8u / bit_size; + return (align % (bit_size / 8u)) == 0 && num_components <= max_components; + } else { + if (bit_size * num_components == 96) { /* 96 bit loads require 128 bit alignment and are split otherwise */ + return align % 16 == 0; + } else if (bit_size == 16 && (align % 4)) { + /* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU + * vectorization, because our vectorizer requires the scalar IR to already contain vectors. + */ + return (align % 2 == 0) && num_components <= 2; + } else { + if (num_components == 3) { + /* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */ + return false; + } + unsigned req = bit_size * num_components; + if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */ + req /= 2u; + return align % (req / 8u) == 0; + } + } + return false; +} + +bool ac_nir_scalarize_overfetching_loads_callback(const nir_instr *instr, const void *data) +{ + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + /* Reject opcodes we don't scalarize. */ + switch (intr->intrinsic) { + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_global: + case nir_intrinsic_load_global_constant: + case nir_intrinsic_load_shared: + break; + default: + return false; + } + + bool uses_smem = nir_intrinsic_has_access(intr) && + nir_intrinsic_access(intr) & ACCESS_SMEM_AMD; + bool is_shared = intr->intrinsic == nir_intrinsic_load_shared; + + enum amd_gfx_level gfx_level = *(enum amd_gfx_level *)data; + unsigned comp_size = intr->def.bit_size / 8; + unsigned load_size = intr->def.num_components * comp_size; + unsigned used_load_size = util_bitcount(nir_def_components_read(&intr->def)) * comp_size; + + /* Scalarize if the load overfetches. That includes loads that overfetch due to load size + * alignment, e.g. when only a power-of-two load is available. The scalarized loads are expected + * to be later vectorized to optimal sizes. + */ + return used_load_size < align_load_store_size(gfx_level, load_size, uses_smem, is_shared); +} + /* Get chip-agnostic memory instruction access flags (as opposed to chip-specific GLC/DLC/SLC) * from a NIR memory intrinsic. 
*/ diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h index 3602e5761a8..bd41b3c8b67 100644 --- a/src/amd/common/ac_nir.h +++ b/src/amd/common/ac_nir.h @@ -372,6 +372,14 @@ ac_nir_optimize_uniform_atomics(nir_shader *nir); unsigned ac_nir_lower_bit_size_callback(const nir_instr *instr, void *data); +bool +ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size, + unsigned num_components, int64_t hole_size, + nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data); + +bool +ac_nir_scalarize_overfetching_loads_callback(const nir_instr *instr, const void *data); + enum gl_access_qualifier ac_nir_get_mem_access_flags(const nir_intrinsic_instr *instr); diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c index 532246d574f..91810359008 100644 --- a/src/amd/common/ac_shader_util.c +++ b/src/amd/common/ac_shader_util.c @@ -114,228 +114,6 @@ void ac_set_nir_options(struct radeon_info *info, bool use_llvm, BITFIELD_BIT(nir_lower_packing_op_unpack_32_4x8); } -static unsigned -align_load_store_size(enum amd_gfx_level gfx_level, unsigned size, bool uses_smem, bool is_shared) -{ - /* LDS can't overfetch because accesses that are partially out of range would be dropped - * entirely, so all unaligned LDS accesses are always split. - */ - if (is_shared) - return size; - - /* Align the size to what the hw supports. Out of range access due to alignment is OK because - * range checking is per dword for untyped instructions. This assumes that the compiler backend - * overfetches due to load size alignment instead of splitting the load. - * - * GFX6-11 don't have 96-bit SMEM loads. - * GFX6 doesn't have 96-bit untyped VMEM loads. - */ - if (gfx_level >= (uses_smem ? GFX12 : GFX7) && size == 96) - return size; - else - return util_next_power_of_two(size); -} - -bool -ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size, - unsigned num_components, int64_t hole_size, nir_intrinsic_instr *low, - nir_intrinsic_instr *high, void *data) -{ - struct ac_nir_config *config = (struct ac_nir_config *)data; - bool uses_smem = (nir_intrinsic_has_access(low) && - nir_intrinsic_access(low) & ACCESS_SMEM_AMD) || - /* These don't have the "access" field. */ - low->intrinsic == nir_intrinsic_load_smem_amd || - low->intrinsic == nir_intrinsic_load_push_constant; - bool is_store = !nir_intrinsic_infos[low->intrinsic].has_dest; - bool is_scratch = low->intrinsic == nir_intrinsic_load_stack || - low->intrinsic == nir_intrinsic_store_stack || - low->intrinsic == nir_intrinsic_load_scratch || - low->intrinsic == nir_intrinsic_store_scratch; - bool is_shared = low->intrinsic == nir_intrinsic_load_shared || - low->intrinsic == nir_intrinsic_store_shared || - low->intrinsic == nir_intrinsic_load_deref || - low->intrinsic == nir_intrinsic_store_deref; - - assert(!is_store || hole_size <= 0); - - /* If we get derefs here, only shared memory derefs are expected. */ - assert((low->intrinsic != nir_intrinsic_load_deref && - low->intrinsic != nir_intrinsic_store_deref) || - nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared)); - - /* Don't vectorize descriptor loads for LLVM due to excessive SGPR and VGPR spilling. */ - if (!config->uses_aco && low->intrinsic == nir_intrinsic_load_smem_amd) - return false; - - /* Reject opcodes we don't vectorize. 
*/ - switch (low->intrinsic) { - case nir_intrinsic_load_smem_amd: - case nir_intrinsic_load_push_constant: - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_stack: - case nir_intrinsic_store_stack: - case nir_intrinsic_load_scratch: - case nir_intrinsic_store_scratch: - case nir_intrinsic_load_global_constant: - case nir_intrinsic_load_global: - case nir_intrinsic_store_global: - case nir_intrinsic_load_ssbo: - case nir_intrinsic_store_ssbo: - case nir_intrinsic_load_deref: - case nir_intrinsic_store_deref: - case nir_intrinsic_load_shared: - case nir_intrinsic_store_shared: - break; - default: - return false; - } - - /* Align the size to what the hw supports. */ - unsigned unaligned_new_size = num_components * bit_size; - unsigned aligned_new_size = align_load_store_size(config->gfx_level, unaligned_new_size, - uses_smem, is_shared); - - if (uses_smem) { - /* Maximize SMEM vectorization except for LLVM, which suffers from SGPR and VGPR spilling. - * GFX6-7 have fewer hw SGPRs, so merge only up to 128 bits to limit SGPR usage. - */ - if (aligned_new_size > (config->gfx_level >= GFX8 ? (config->uses_aco ? 512 : 256) : 128)) - return false; - } else { - if (aligned_new_size > 128) - return false; - - /* GFX6-8 only support 32-bit scratch loads/stores. */ - if (config->gfx_level <= GFX8 && is_scratch && aligned_new_size > 32) - return false; - } - - if (!is_store) { - /* Non-descriptor loads. */ - if (low->intrinsic != nir_intrinsic_load_ubo && - low->intrinsic != nir_intrinsic_load_ssbo) { - /* Only increase the size of loads if doing so doesn't extend into a new page. - * Here we set alignment to MAX because we don't know the alignment of global - * pointers before adding the offset. - */ - uint32_t resource_align = low->intrinsic == nir_intrinsic_load_global_constant || - low->intrinsic == nir_intrinsic_load_global ? NIR_ALIGN_MUL_MAX : 4; - uint32_t page_size = 4096; - uint32_t mul = MIN3(align_mul, page_size, resource_align); - unsigned end = (align_offset + unaligned_new_size / 8u) & (mul - 1); - if ((aligned_new_size - unaligned_new_size) / 8u > (mul - end)) - return false; - } - - /* Only allow SMEM loads to overfetch by 32 bits: - * - * Examples (the hole is indicated by parentheses, the numbers are in bytes, the maximum - * overfetch size is 4): - * 4 | (4) | 4 -> hw loads 12 : ALLOWED (4 over) - * 4 | (4) | 4 -> hw loads 16 : DISALLOWED (8 over) - * 4 | 4 | 4 -> hw loads 16 : ALLOWED (4 over) - * 4 | (4) | 8 -> hw loads 16 : ALLOWED (4 over) - * 16 | 4 -> hw loads 32 : DISALLOWED (12 over) - * 16 | 8 -> hw loads 32 : DISALLOWED (8 over) - * 16 | 12 -> hw loads 32 : ALLOWED (4 over) - * 16 | (4) | 12 -> hw loads 32 : ALLOWED (4 over) - * 32 | 16 -> hw loads 64 : DISALLOWED (16 over) - * 32 | 28 -> hw loads 64 : ALLOWED (4 over) - * 32 | (4) | 28 -> hw loads 64 : ALLOWED (4 over) - * - * Note that we can overfetch by more than 4 bytes if we merge more than 2 loads, e.g.: - * 4 | (4) | 8 | (4) | 12 -> hw loads 32 : ALLOWED (4 + 4 over) - * - * That's because this callback is called twice in that case, each time allowing only 4 over. - * - * This is only enabled for ACO. LLVM spills SGPRs and VGPRs too much. 
- */ - unsigned overfetch_size = 0; - - if (config->uses_aco && uses_smem && aligned_new_size >= 128) - overfetch_size = 32; - - int64_t aligned_unvectorized_size = - align_load_store_size(config->gfx_level, low->num_components * low->def.bit_size, - uses_smem, is_shared) + - align_load_store_size(config->gfx_level, high->num_components * high->def.bit_size, - uses_smem, is_shared); - - if (aligned_new_size > aligned_unvectorized_size + overfetch_size) - return false; - } - - uint32_t align; - if (align_offset) - align = 1 << (ffs(align_offset) - 1); - else - align = align_mul; - - /* Validate the alignment and number of components. */ - if (!is_shared) { - unsigned max_components; - if (align % 4 == 0) - max_components = NIR_MAX_VEC_COMPONENTS; - else if (align % 2 == 0) - max_components = 16u / bit_size; - else - max_components = 8u / bit_size; - return (align % (bit_size / 8u)) == 0 && num_components <= max_components; - } else { - if (bit_size * num_components == 96) { /* 96 bit loads require 128 bit alignment and are split otherwise */ - return align % 16 == 0; - } else if (bit_size == 16 && (align % 4)) { - /* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU - * vectorization, because our vectorizer requires the scalar IR to already contain vectors. - */ - return (align % 2 == 0) && num_components <= 2; - } else { - if (num_components == 3) { - /* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */ - return false; - } - unsigned req = bit_size * num_components; - if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */ - req /= 2u; - return align % (req / 8u) == 0; - } - } - return false; -} - -bool ac_nir_scalarize_overfetching_loads_callback(const nir_instr *instr, const void *data) -{ - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - /* Reject opcodes we don't scalarize. */ - switch (intr->intrinsic) { - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_ssbo: - case nir_intrinsic_load_global: - case nir_intrinsic_load_global_constant: - case nir_intrinsic_load_shared: - break; - default: - return false; - } - - bool uses_smem = nir_intrinsic_has_access(intr) && - nir_intrinsic_access(intr) & ACCESS_SMEM_AMD; - bool is_shared = intr->intrinsic == nir_intrinsic_load_shared; - - enum amd_gfx_level gfx_level = *(enum amd_gfx_level *)data; - unsigned comp_size = intr->def.bit_size / 8; - unsigned load_size = intr->def.num_components * comp_size; - unsigned used_load_size = util_bitcount(nir_def_components_read(&intr->def)) * comp_size; - - /* Scalarize if the load overfetches. That includes loads that overfetch due to load size - * alignment, e.g. when only a power-of-two load is available. The scalarized loads are expected - * to be later vectorized to optimal sizes. 
- */ - return used_load_size < align_load_store_size(gfx_level, load_size, uses_smem, is_shared); -} - unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask, bool writes_mrt0_alpha) { diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h index 5008f947a80..ceb14e76ca8 100644 --- a/src/amd/common/ac_shader_util.h +++ b/src/amd/common/ac_shader_util.h @@ -249,12 +249,6 @@ struct ac_nir_config { void ac_set_nir_options(struct radeon_info *info, bool use_llvm, nir_shader_compiler_options *options); -bool ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size, - unsigned num_components, int64_t hole_size, - nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data); - -bool ac_nir_scalarize_overfetching_loads_callback(const nir_instr *instr, const void *data); - unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask, bool writes_mrt0_alpha); diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c index 07925c58932..f6a41a10403 100644 --- a/src/amd/vulkan/radv_pipeline_rt.c +++ b/src/amd/vulkan/radv_pipeline_rt.c @@ -19,6 +19,7 @@ #include "radv_pipeline_rt.h" #include "radv_rmv.h" #include "radv_shader.h" +#include "ac_nir.h" struct rt_handle_hash_entry { uint32_t key;
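For context, the two callbacks being moved are meant to be plugged into NIR's load/store vectorizer and, for the overfetch case, into a scalarizing pass that runs before it. Below is a minimal sketch of how a driver might wire them up; it is not part of the patch above. The variable-mode masks, the use of the filter-callback variant of nir_lower_io_to_scalar, and the helper name example_vectorize_mem are illustrative assumptions based on typical Mesa usage, while the ac_nir_mem_vectorize_callback / ac_nir_scalarize_overfetching_loads_callback signatures and their cb_data types (struct ac_nir_config * and enum amd_gfx_level *) follow directly from the code in this patch.

/* Illustrative sketch only -- not part of the patch above.
 * Assumes a struct ac_nir_config (defined in ac_shader_util.h) with at least
 * gfx_level and uses_aco filled in, as read by ac_nir_mem_vectorize_callback.
 */
#include "nir.h"
#include "ac_nir.h"
#include "ac_shader_util.h"

static bool
example_vectorize_mem(nir_shader *nir, struct ac_nir_config *config)
{
   bool progress = false;

   /* Scalarize loads that overfetch (including loads padded up to a power of two),
    * so the vectorizer below can rebuild them at optimal sizes. Assumes the
    * nir_lower_io_to_scalar variant that takes an instruction filter callback;
    * the callback's data is the gfx_level, per its implementation.
    */
   NIR_PASS(progress, nir, nir_lower_io_to_scalar,
            nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global | nir_var_mem_shared,
            ac_nir_scalarize_overfetching_loads_callback, &config->gfx_level);

   /* Re-vectorize memory access using the chip-aware callback. The mode mask here
    * is an example; drivers pick the modes they actually lower this way.
    */
   const nir_load_store_vectorize_options vectorize_opts = {
      .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_push_const |
               nir_var_mem_global | nir_var_mem_shared | nir_var_function_temp,
      .callback = ac_nir_mem_vectorize_callback,
      .cb_data = config,
   };
   NIR_PASS(progress, nir, nir_opt_load_store_vectorize, &vectorize_opts);

   return progress;
}

With the callbacks exported from ac_nir.h instead of ac_shader_util.h, callers such as radv_pipeline_rt.c only need the ac_nir.h include added at the end of this patch.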