diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 4bae3d2d396..e4b837c561c 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -383,7 +383,8 @@ type_size(const struct glsl_type *type, bool bindless) } bool -mem_vectorize_callback(unsigned align, unsigned bit_size, +mem_vectorize_callback(unsigned align_mul, unsigned align_offset, + unsigned bit_size, unsigned num_components, unsigned high_offset, nir_intrinsic_instr *low, nir_intrinsic_instr *high) { @@ -394,6 +395,13 @@ mem_vectorize_callback(unsigned align, unsigned bit_size, if (bit_size * num_components > 128) return false; + /* Effective alignment: lowest set bit of align_offset if non-zero, else align_mul; 1u keeps the shift unsigned. */ + uint32_t align; + if (align_offset) + align = 1u << (ffs(align_offset) - 1); + else + align = align_mul; + switch (low->intrinsic) { case nir_intrinsic_load_global: case nir_intrinsic_store_global: diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 80400dc7996..80d89b7f3f2 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -4851,7 +4851,9 @@ bool nir_opt_vectorize(nir_shader *shader, nir_opt_vectorize_cb filter, bool nir_opt_conditional_discard(nir_shader *shader); -typedef bool (*nir_should_vectorize_mem_func)(unsigned align, unsigned bit_size, +typedef bool (*nir_should_vectorize_mem_func)(unsigned align_mul, + unsigned align_offset, + unsigned bit_size, unsigned num_components, unsigned high_offset, nir_intrinsic_instr *low, nir_intrinsic_instr *high); diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c index 35a650736f9..6ae69031820 100644 --- a/src/compiler/nir/nir_opt_load_store_vectorize.c +++ b/src/compiler/nir/nir_opt_load_store_vectorize.c @@ -667,8 +667,9 @@ new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size, if (new_bit_size / common_bit_size > NIR_MAX_VEC_COMPONENTS) return false; - uint32_t align = low->align_offset 
? 1 << (ffs(low->align_offset) - 1) : low->align_mul; - if (!ctx->callback(align, new_bit_size, new_num_components, + if (!ctx->callback(low->align_mul, + low->align_offset, + new_bit_size, new_num_components, high_offset, low->intrin, high->intrin)) return false; diff --git a/src/compiler/nir/tests/load_store_vectorizer_tests.cpp b/src/compiler/nir/tests/load_store_vectorizer_tests.cpp index 708cbefbabd..dd9d4015836 100644 --- a/src/compiler/nir/tests/load_store_vectorizer_tests.cpp +++ b/src/compiler/nir/tests/load_store_vectorizer_tests.cpp @@ -70,7 +70,8 @@ protected: bool test_alu(nir_instr *instr, nir_op op); bool test_alu_def(nir_instr *instr, unsigned index, nir_ssa_def *def, unsigned swizzle=0); - static bool mem_vectorize_callback(unsigned align, unsigned bit_size, + static bool mem_vectorize_callback(unsigned align_mul, unsigned align_offset, + unsigned bit_size, unsigned num_components, unsigned high_offset, nir_intrinsic_instr *low, nir_intrinsic_instr *high); static void shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align); @@ -363,7 +364,8 @@ bool nir_load_store_vectorize_test::test_alu_def( } bool nir_load_store_vectorize_test::mem_vectorize_callback( - unsigned align, unsigned bit_size, unsigned num_components, unsigned high_offset, + unsigned align_mul, unsigned align_offset, unsigned bit_size, + unsigned num_components, unsigned high_offset, nir_intrinsic_instr *low, nir_intrinsic_instr *high) { return bit_size / 8; diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 335a8ef854d..c1af534295f 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -855,7 +855,8 @@ brw_nir_link_shaders(const struct brw_compiler *compiler, } static bool -brw_nir_should_vectorize_mem(unsigned align, unsigned bit_size, +brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset, + unsigned bit_size, unsigned num_components, unsigned high_offset, nir_intrinsic_instr *low, 
nir_intrinsic_instr *high) @@ -873,6 +874,14 @@ brw_nir_should_vectorize_mem(unsigned align, unsigned bit_size, if (num_components > 4) return false; + + /* Effective alignment: lowest set bit of align_offset if non-zero, else align_mul; 1u keeps the shift unsigned. */ + uint32_t align; + if (align_offset) + align = 1u << (ffs(align_offset) - 1); + else + align = align_mul; + if (align < bit_size / 8) return false;