From f8aa83f0c86e25c7371b49cf0bb5c69e36b6b3c8 Mon Sep 17 00:00:00 2001 From: Faith Ekstrand Date: Wed, 8 Feb 2023 10:46:07 -0600 Subject: [PATCH] intel/nir: Use nir_lower_mem_access_bit_sizes() This drops the Intel-specific pass in favor of the new generic one. No shader-db changes on Skylake or DG2. Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_nir.c | 84 +++- .../brw_nir_lower_mem_access_bit_sizes.c | 363 ------------------ src/intel/compiler/meson.build | 1 - 3 files changed, 82 insertions(+), 366 deletions(-) delete mode 100644 src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 9365490bea6..76c2ca4b433 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -1266,13 +1266,93 @@ bool combine_all_barriers(nir_intrinsic_instr *a, return true; } +static nir_mem_access_size_align +get_mem_access_size_align(nir_intrinsic_op intrin, uint8_t bytes, + uint32_t align_mul, uint32_t align_offset, + bool offset_is_const, const void *cb_data) +{ + assert(align_offset < align_mul); + const uint32_t align = + align_offset ? 1 << (ffs(align_offset) - 1) : align_mul; + + switch (intrin) { + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_shared: + case nir_intrinsic_load_scratch: + /* The offset is constant so we can use a 32-bit load and just shift it + * around as needed. + */ + if (align < 4 && offset_is_const) { + assert(util_is_power_of_two_nonzero(align_mul) && align_mul >= 4); + const unsigned pad = align_offset % 4; + const unsigned comps32 = DIV_ROUND_UP(bytes + pad, 4); + return (nir_mem_access_size_align) { + .bit_size = 32, + .num_components = comps32, + .align_mul = 4, + }; + } + break; + + case nir_intrinsic_load_task_payload: + if (bytes < 4 || align < 4) { + return (nir_mem_access_size_align) { + .bit_size = 32, + .num_components = 1, + .align_mul = 4, + }; + } + break; + + default: + break; + } + + const bool is_load = nir_intrinsic_infos[intrin].has_dest; + const bool is_scratch = intrin == nir_intrinsic_load_scratch || + intrin == nir_intrinsic_store_scratch; + + if (align < 4 || bytes < 4) { + /* Choose a byte, word, or dword */ + bytes = MIN2(bytes, 4); + if (bytes == 3) + bytes = is_load ? 4 : 2; + + if (is_scratch) { + /* The way scratch address swizzling works in the back-end, it + * happens at a DWORD granularity so we can't have a single load + * or store cross a DWORD boundary. + */ + if ((align_offset % 4) + bytes > MIN2(align_mul, 4)) + bytes = MIN2(align_mul, 4) - (align_offset % 4); + + /* Must be a power of two */ + if (bytes == 3) + bytes = 2; + } + + return (nir_mem_access_size_align) { + .bit_size = bytes * 8, + .num_components = 1, + .align_mul = 1, + }; + } else { + bytes = MIN2(bytes, 16); + return (nir_mem_access_size_align) { + .bit_size = 32, + .num_components = is_scratch ? 1 : + is_load ? 
DIV_ROUND_UP(bytes, 4) : bytes / 4, + .align_mul = 4, + }; + } +} + static void brw_vectorize_lower_mem_access(nir_shader *nir, const struct brw_compiler *compiler, bool is_scalar, bool robust_buffer_access) { - const struct intel_device_info *devinfo = compiler->devinfo; bool progress = false; if (is_scalar) { @@ -1292,7 +1372,7 @@ brw_vectorize_lower_mem_access(nir_shader *nir, OPT(nir_opt_load_store_vectorize, &options); } - OPT(brw_nir_lower_mem_access_bit_sizes, devinfo); + OPT(nir_lower_mem_access_bit_sizes, get_mem_access_size_align, NULL); while (progress) { progress = false; diff --git a/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c b/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c deleted file mode 100644 index d7c11c9df1c..00000000000 --- a/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c +++ /dev/null @@ -1,363 +0,0 @@ -/* - * Copyright © 2018 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "brw_nir.h" -#include "compiler/nir/nir_builder.h" -#include "util/u_math.h" -#include "util/bitscan.h" - -static nir_intrinsic_instr * -dup_mem_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, - nir_ssa_def *store_src, int offset, - unsigned num_components, unsigned bit_size) -{ - const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic]; - - nir_intrinsic_instr *dup = - nir_intrinsic_instr_create(b->shader, intrin->intrinsic); - - nir_src *intrin_offset_src = nir_get_io_offset_src(intrin); - for (unsigned i = 0; i < info->num_srcs; i++) { - assert(intrin->src[i].is_ssa); - if (i == 0 && store_src) { - assert(!info->has_dest); - assert(&intrin->src[i] != intrin_offset_src); - dup->src[i] = nir_src_for_ssa(store_src); - } else if (&intrin->src[i] == intrin_offset_src) { - dup->src[i] = nir_src_for_ssa(nir_iadd_imm(b, intrin->src[i].ssa, - offset)); - } else { - dup->src[i] = nir_src_for_ssa(intrin->src[i].ssa); - } - } - - dup->num_components = num_components; - if (intrin->intrinsic == nir_intrinsic_load_scratch || - intrin->intrinsic == nir_intrinsic_store_scratch) - assert(num_components == 1); - - for (unsigned i = 0; i < info->num_indices; i++) - dup->const_index[i] = intrin->const_index[i]; - - if (nir_intrinsic_has_access(intrin)) - nir_intrinsic_set_access(dup, nir_intrinsic_access(intrin)); - - const unsigned align_mul = nir_intrinsic_align_mul(intrin); - const unsigned align_offset = - (nir_intrinsic_align_offset(intrin) + (unsigned)offset) % align_mul; - nir_intrinsic_set_align_offset(dup, align_offset); - - if (info->has_dest) { - assert(intrin->dest.is_ssa); - nir_ssa_dest_init(&dup->instr, &dup->dest, - num_components, bit_size, NULL); - } else { - nir_intrinsic_set_write_mask(dup, (1 << num_components) - 1); - } - - nir_builder_instr_insert(b, &dup->instr); - - return dup; -} - -static bool -lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, - const struct intel_device_info *devinfo) -{ - const bool needs_scalar = - intrin->intrinsic == nir_intrinsic_load_scratch; - - assert(intrin->dest.is_ssa); - const unsigned bit_size = intrin->dest.ssa.bit_size; - const unsigned num_components = intrin->dest.ssa.num_components; - const unsigned bytes_read = num_components * (bit_size / 8); - const unsigned align_mul = nir_intrinsic_align_mul(intrin); - const unsigned align_offset = nir_intrinsic_align_offset(intrin); - const unsigned align = nir_intrinsic_align(intrin); - - if (bit_size == 32 && align >= 32 && intrin->num_components <= 4 && - (!needs_scalar || intrin->num_components == 1)) - return false; - - nir_ssa_def *result; - nir_src *offset_src = nir_get_io_offset_src(intrin); - if (bit_size < 32 && !needs_scalar && nir_src_is_const(*offset_src)) { - /* The offset is constant so we can use a 32-bit load and just shift it - * around as needed. - */ - const int load_offset = nir_src_as_uint(*offset_src) % 4; - assert(load_offset % (bit_size / 8) == 0); - const unsigned load_comps32 = DIV_ROUND_UP(bytes_read + load_offset, 4); - /* A 16-bit vec4 is a 32-bit vec2. We add an extra component in case - * we offset into a component with load_offset. 
- */ - assert(load_comps32 <= 3); - - nir_intrinsic_instr *load_instr = - dup_mem_intrinsic(b, intrin, NULL, -load_offset, load_comps32, 32); - nir_ssa_def *load = &load_instr->dest.ssa; - result = nir_extract_bits(b, &load, 1, load_offset * 8, - num_components, bit_size); - } else if (bit_size < 32 && intrin->intrinsic == nir_intrinsic_load_task_payload) { - /* In task shaders we lower task payload stores & loads to shared memory, - * so this code should be used only for mesh shaders. - */ - assert(b->shader->info.stage == MESA_SHADER_MESH); - nir_ssa_def *unaligned_offset = nir_ssa_for_src(b, intrin->src[0], 1); - - /* offset aligned to dword */ - nir_ssa_def *aligned_offset = nir_iand_imm(b, unaligned_offset, ~0x3u); - - /* offset from last dword */ - nir_ssa_def *dword_offset = nir_iand_imm(b, unaligned_offset, 0x3u); - - nir_intrinsic_instr *new_load_instr = - dup_mem_intrinsic(b, intrin, NULL, 0, 1, 32); - - nir_ssa_def *new_load = &new_load_instr->dest.ssa; - - nir_instr_rewrite_src_ssa(&new_load_instr->instr, - &new_load_instr->src[0], - aligned_offset); - - /* extract bit_size bits starting from dword_offset * 8 */ - result = nir_iand_imm(b, nir_ishr(b, new_load, - nir_imul_imm(b, dword_offset, 8)), - (1u << bit_size) - 1); - } else { - /* Otherwise, we have to break it into smaller loads. We could end up - * with as many as 32 loads if we're loading a u64vec16 from scratch. - */ - nir_ssa_def *loads[32]; - unsigned num_loads = 0; - int load_offset = 0; - while (load_offset < bytes_read) { - const unsigned bytes_left = bytes_read - load_offset; - unsigned load_bit_size, load_comps; - if (align < 4) { - /* Choose a byte, word, or dword */ - unsigned load_bytes = util_next_power_of_two(MIN2(bytes_left, 4)); - - if (intrin->intrinsic == nir_intrinsic_load_scratch) { - /* The way scratch address swizzling works in the back-end, it - * happens at a DWORD granularity so we can't have a single load - * or store cross a DWORD boundary. - */ - if ((align_offset % 4) + load_bytes > MIN2(align_mul, 4)) - load_bytes = MIN2(align_mul, 4) - (align_offset % 4); - } - - /* Must be a power of two */ - if (load_bytes == 3) - load_bytes = 2; - - load_bit_size = load_bytes * 8; - load_comps = 1; - } else { - assert(load_offset % 4 == 0); - load_bit_size = 32; - load_comps = needs_scalar ? 
1 : - DIV_ROUND_UP(MIN2(bytes_left, 16), 4); - } - - nir_intrinsic_instr *load_instr = - dup_mem_intrinsic(b, intrin, NULL, load_offset, load_comps, - load_bit_size); - loads[num_loads++] = &load_instr->dest.ssa; - - load_offset += load_comps * (load_bit_size / 8); - } - assert(num_loads <= ARRAY_SIZE(loads)); - result = nir_extract_bits(b, loads, num_loads, 0, - num_components, bit_size); - } - - nir_ssa_def_rewrite_uses(&intrin->dest.ssa, - result); - nir_instr_remove(&intrin->instr); - - return true; -} - -static bool -lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin, - const struct intel_device_info *devinfo) -{ - const bool needs_scalar = - intrin->intrinsic == nir_intrinsic_store_scratch; - - assert(intrin->src[0].is_ssa); - nir_ssa_def *value = intrin->src[0].ssa; - - assert(intrin->num_components == value->num_components); - const unsigned bit_size = value->bit_size; - const unsigned num_components = intrin->num_components; - const unsigned bytes_written = num_components * (bit_size / 8); - const unsigned align_mul = nir_intrinsic_align_mul(intrin); - const unsigned align_offset = nir_intrinsic_align_offset(intrin); - const unsigned align = nir_intrinsic_align(intrin); - - nir_component_mask_t writemask = nir_intrinsic_write_mask(intrin); - assert(writemask < (1 << num_components)); - - if ((value->bit_size <= 32 && num_components == 1) || - (value->bit_size == 32 && num_components <= 4 && align >= 32 && - writemask == (1 << num_components) - 1 && - !needs_scalar)) - return false; - - nir_src *offset_src = nir_get_io_offset_src(intrin); - const bool offset_is_const = nir_src_is_const(*offset_src); - const unsigned const_offset = - offset_is_const ? nir_src_as_uint(*offset_src) : 0; - - const unsigned byte_size = bit_size / 8; - assert(byte_size <= sizeof(uint64_t)); - - BITSET_DECLARE(mask, NIR_MAX_VEC_COMPONENTS * sizeof(uint64_t)); - BITSET_ZERO(mask); - - for (unsigned i = 0; i < num_components; i++) { - if (writemask & (1u << i)) - BITSET_SET_RANGE_INSIDE_WORD(mask, i * byte_size, ((i + 1) * byte_size) - 1); - } - - while (BITSET_FFS(mask) != 0) { - const int start = BITSET_FFS(mask) - 1; - - int end; - for (end = start + 1; end < bytes_written; end++) { - if (!(BITSET_TEST(mask, end))) - break; - } - /* The size of the current contiguous chunk in bytes */ - const unsigned chunk_bytes = end - start; - - const bool is_dword_aligned = - (align_mul >= 4 && (align_offset + start) % 4 == 0) || - (offset_is_const && (start + const_offset) % 4 == 0); - - unsigned store_comps, store_bit_size; - if (chunk_bytes >= 4 && is_dword_aligned) { - store_bit_size = 32; - store_comps = needs_scalar ? 1 : MIN2(chunk_bytes, 16) / 4; - } else { - unsigned store_bytes = MIN2(chunk_bytes, 4); - - if (intrin->intrinsic == nir_intrinsic_store_scratch) { - /* The way scratch address swizzling works in the back-end, it - * happens at a DWORD granularity so we can't have a single load - * or store cross a DWORD boundary. 
- */ - if ((align_offset % 4) + store_bytes > MIN2(align_mul, 4)) - store_bytes = MIN2(align_mul, 4) - (align_offset % 4); - } - - /* Must be a power of two */ - if (store_bytes == 3) - store_bytes = 2; - - store_bit_size = store_bytes * 8; - store_comps = 1; - } - const unsigned store_bytes = store_comps * (store_bit_size / 8); - - nir_ssa_def *packed = nir_extract_bits(b, &value, 1, start * 8, - store_comps, store_bit_size); - - dup_mem_intrinsic(b, intrin, packed, start, - store_comps, store_bit_size); - - BITSET_CLEAR_RANGE(mask, start, (start + store_bytes - 1)); - } - - nir_instr_remove(&intrin->instr); - - return true; -} - -static bool -lower_mem_access_bit_sizes_instr(nir_builder *b, - nir_instr *instr, - void *cb_data) -{ - const struct intel_device_info *devinfo = cb_data; - - if (instr->type != nir_instr_type_intrinsic) - return false; - - b->cursor = nir_after_instr(instr); - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - switch (intrin->intrinsic) { - case nir_intrinsic_load_global: - case nir_intrinsic_load_global_constant: - case nir_intrinsic_load_ssbo: - case nir_intrinsic_load_shared: - case nir_intrinsic_load_scratch: - case nir_intrinsic_load_task_payload: - return lower_mem_load_bit_size(b, intrin, devinfo); - - case nir_intrinsic_store_global: - case nir_intrinsic_store_ssbo: - case nir_intrinsic_store_shared: - case nir_intrinsic_store_scratch: - case nir_intrinsic_store_task_payload: - return lower_mem_store_bit_size(b, intrin, devinfo); - - default: - return false; - } -} - -/** - * This pass loads arbitrary SSBO and shared memory load/store operations to - * intrinsics which are natively handleable by GEN hardware. In particular, - * we have two general types of memory load/store messages: - * - * - Untyped surface read/write: These can load/store between one and four - * dword components to/from a dword-aligned offset. - * - * - Byte scattered read/write: These can load/store a single byte, word, or - * dword scalar to/from an unaligned byte offset. - * - * Neither type of message can do a write-masked store. This pass converts - * all nir load/store intrinsics into a series of either 8 or 32-bit - * load/store intrinsics with a number of components that we can directly - * handle in hardware and with a trivial write-mask. - * - * For scratch access, additional consideration has to be made due to the way - * that we swizzle the memory addresses to achieve decent cache locality. In - * particular, even though untyped surface read/write messages exist and work, - * we can't use them to load multiple components in a single SEND. For more - * detail on the scratch swizzle, see fs_visitor::swizzle_nir_scratch_addr. - */ -bool -brw_nir_lower_mem_access_bit_sizes(nir_shader *shader, - const struct intel_device_info *devinfo) -{ - return nir_shader_instructions_pass(shader, lower_mem_access_bit_sizes_instr, - nir_metadata_block_index | - nir_metadata_dominance, - (void *)devinfo); -} diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index 77b6ec6c304..817583b461c 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -91,7 +91,6 @@ libintel_compiler_files = files( 'brw_nir_lower_cs_intrinsics.c', 'brw_nir_lower_alpha_to_coverage.c', 'brw_nir_lower_intersection_shader.c', - 'brw_nir_lower_mem_access_bit_sizes.c', 'brw_nir_lower_ray_queries.c', 'brw_nir_lower_rt_intrinsics.c', 'brw_nir_lower_scoped_barriers.c',
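
A note for readers unfamiliar with NIR's alignment model: each memory intrinsic carries an (align_mul, align_offset) pair (see the nir_intrinsic_align_mul()/nir_intrinsic_align_offset() calls in the deleted pass), and the first thing the new get_mem_access_size_align() callback does is collapse that pair into a single conservative byte alignment before choosing a message width. The standalone sketch below is not part of the patch; the helper name combined_align and the example values are illustrative only.

/* Minimal sketch of the alignment reduction at the top of
 * get_mem_access_size_align(): an access whose offset is congruent to
 * align_offset modulo a power-of-two align_mul is only guaranteed to be
 * aligned to the lowest set bit of align_offset, or to align_mul itself
 * when align_offset is zero.
 */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <strings.h> /* ffs() */

static uint32_t
combined_align(uint32_t align_mul, uint32_t align_offset)
{
   assert(align_offset < align_mul);
   return align_offset ? 1u << (ffs(align_offset) - 1) : align_mul;
}

int
main(void)
{
   /* Dword-aligned access: align = 4, so the 32-bit paths are usable. */
   printf("%" PRIu32 "\n", combined_align(4, 0));   /* 4 */

   /* 16-byte align_mul but 2 bytes into it: only 2-byte aligned, which
    * lands in the callback's "align < 4" byte/word handling.
    */
   printf("%" PRIu32 "\n", combined_align(16, 2));  /* 2 */

   /* align_offset = 6 = 0b110: lowest set bit is 2. */
   printf("%" PRIu32 "\n", combined_align(8, 6));   /* 2 */

   return 0;
}

The callback then only has to reason about that single byte alignment: 4 or more means full dword messages are safe, anything smaller falls back to byte/word scattered access (with the extra dword-boundary restriction for swizzled scratch addresses).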