diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 89efccaad37..d7415526d2e 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4938,6 +4938,62 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_load_ssbo_uniform_block_intel: + case nir_intrinsic_load_shared_uniform_block_intel: { + fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; + + const bool is_ssbo = + instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel; + srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ? + get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM)); + + const unsigned total_dwords = ALIGN(instr->num_components, REG_SIZE / 4); + unsigned loaded_dwords = 0; + + const fs_builder ubld1 = bld.exec_all().group(1, 0); + const fs_builder ubld8 = bld.exec_all().group(8, 0); + const fs_builder ubld16 = bld.exec_all().group(16, 0); + + const fs_reg packed_consts = + ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords); + + const nir_src load_offset = is_ssbo ? instr->src[1] : instr->src[0]; + if (nir_src_is_const(load_offset)) { + fs_reg addr = ubld8.vgrf(BRW_REGISTER_TYPE_UD); + ubld8.MOV(addr, brw_imm_ud(nir_src_as_uint(load_offset))); + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0); + } else { + srcs[SURFACE_LOGICAL_SRC_ADDRESS] = + bld.emit_uniformize(get_nir_src(load_offset)); + } + + while (loaded_dwords < total_dwords) { + const unsigned block = + choose_oword_block_size_dwords(devinfo, + total_dwords - loaded_dwords); + const unsigned block_bytes = block * 4; + + srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); + + const fs_builder &ubld = block <= 8 ? 
ubld8 : ubld16; + ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, + retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD), + srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = align(block_bytes, REG_SIZE); + + loaded_dwords += block; + + ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], + srcs[SURFACE_LOGICAL_SRC_ADDRESS], + brw_imm_ud(block_bytes)); + } + + for (unsigned c = 0; c < instr->num_components; c++) + bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD), + component(packed_consts, c)); + + break; + } + case nir_intrinsic_store_output: { assert(nir_src_bit_size(instr->src[0]) == 32); fs_reg src = get_nir_src(instr->src[0]); diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 0dbefc288d4..777e3e252c8 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -1301,11 +1301,25 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset, if (bit_size > 32) return false; - /* We can handle at most a vec4 right now. Anything bigger would get - * immediately split by brw_nir_lower_mem_access_bit_sizes anyway. - */ - if (num_components > 4) - return false; + if (low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel || + low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel) { + if (num_components > 4) { + if (!util_is_power_of_two_nonzero(num_components)) + return false; + + if (bit_size != 32) + return false; + + if (num_components > 32) + return false; + } + } else { + /* We can handle at most a vec4 right now. Anything bigger would get + * immediately split by brw_nir_lower_mem_access_bit_sizes anyway. 
+ */ + if (num_components > 4) + return false; + } uint32_t align; @@ -1447,6 +1461,31 @@ brw_vectorize_lower_mem_access(nir_shader *nir, } OPT(nir_opt_load_store_vectorize, &options); + + /* Only run the blockify optimization on Gfx9+ because although prior HW + * versions have support for block loads, they do have limitations on + * alignment as well as requiring split sends which are not supported + * there. + */ + if (compiler->devinfo->ver >= 9) { + /* Required for nir_divergence_analysis() */ + OPT(nir_convert_to_lcssa, true, true); + + /* When HW supports block loads, using the divergence analysis, try + * to find uniform SSBO loads and turn them into block loads. + * + * Rerun the vectorizer after that to make the largest possible block + * loads. + * + * This is a win on 2 fronts: + * - fewer send messages + * - reduced register pressure + */ + nir_divergence_analysis(nir); + if (OPT(brw_nir_blockify_uniform_loads, compiler->devinfo)) + OPT(nir_opt_load_store_vectorize, &options); + OPT(nir_opt_remove_phis); + } } OPT(nir_lower_mem_access_bit_sizes, diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 423b5588787..1a564f09675 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -187,6 +187,9 @@ bool brw_nir_opt_peephole_imul32x16(nir_shader *shader); bool brw_nir_clamp_per_vertex_loads(nir_shader *shader, unsigned input_vertices); +bool brw_nir_blockify_uniform_loads(nir_shader *shader, + const struct intel_device_info *devinfo); + void brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, bool is_scalar); diff --git a/src/intel/compiler/brw_nir_blockify_uniform_loads.c b/src/intel/compiler/brw_nir_blockify_uniform_loads.c new file mode 100644 index 00000000000..adc70bd0c33 --- /dev/null +++ b/src/intel/compiler/brw_nir_blockify_uniform_loads.c @@ -0,0 +1,101 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + *
copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "isl/isl.h" + +#include "brw_nir.h" + +static bool +brw_nir_blockify_uniform_loads_instr(nir_builder *b, + nir_instr *instr, + void *cb_data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + const struct intel_device_info *devinfo = cb_data; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_load_ssbo: + /* BDW PRMs, Volume 7: 3D-Media-GPGPU: OWord Block ReadWrite: + * + * "The surface base address must be OWord-aligned." + * + * We can't make that guarantee with SSBOs where the alignment is + * 4bytes. + */ + if (devinfo->ver < 9) + return false; + + if (nir_src_is_divergent(intrin->src[1])) + return false; + + if (nir_dest_bit_size(intrin->dest) != 32) + return false; + + /* Without the LSC, we can only do block loads of at least 4dwords (1 + * oword). 
+ */ + if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4) + return false; + + intrin->intrinsic = nir_intrinsic_load_ssbo_uniform_block_intel; + return true; + + case nir_intrinsic_load_shared: + /* Block loads on shared memory are not supported before the LSC. */ + if (!devinfo->has_lsc) + return false; + + if (nir_src_is_divergent(intrin->src[0])) + return false; + + if (nir_dest_bit_size(intrin->dest) != 32) + return false; + + /* NOTE(review): dead condition — devinfo->has_lsc is always true past + * the early return above, so this check (copied from the SSBO case) + * can never trigger. + */ + if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4) + return false; + + intrin->intrinsic = nir_intrinsic_load_shared_uniform_block_intel; + return true; + + default: + return false; + } +} + +bool +brw_nir_blockify_uniform_loads(nir_shader *shader, + const struct intel_device_info *devinfo) +{ + return nir_shader_instructions_pass(shader, + brw_nir_blockify_uniform_loads_instr, + nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_live_ssa_defs, + (void *) devinfo); +} diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index c3010d6dac7..9a03d372598 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -86,6 +86,7 @@ libintel_compiler_files = files( 'brw_nir_analyze_boolean_resolves.c', 'brw_nir_analyze_ubo_ranges.c', 'brw_nir_attribute_workarounds.c', + 'brw_nir_blockify_uniform_loads.c', 'brw_nir_clamp_per_vertex_loads.c', 'brw_nir_lower_conversions.c', 'brw_nir_lower_cs_intrinsics.c',