intel/fs: optimize uniform SSBO & shared loads
Using divergence analysis, figure out when SSBO & shared memory loads
are uniform and carry the data only once in register space.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21853>
commit a358b97c58
parent 2cf93f7632
committed by Marge Bot
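To put a number on "carry the data only once in register space", here is a back-of-the-envelope model (an editorial sketch, not part of the commit; its only assumption is the 32-byte GRF implied by the REG_SIZE / 4 dword padding used in the emission code below):

#include <stdio.h>

#define REG_SIZE 32                      /* bytes per GRF, as in the diff below */
#define ALIGN(v, a) (((v) + (a) - 1) / (a) * (a))

int main(void)
{
   const unsigned simd_width = 16;       /* lanes in a SIMD16 shader */
   const unsigned num_components = 4;    /* a vec4 of 32-bit values */

   /* Divergent load: every lane carries its own copy of each component. */
   unsigned divergent_grfs = simd_width * num_components * 4 / REG_SIZE;

   /* Uniform block load: the data lives once, padded to whole GRFs
    * (the same ALIGN(num_components, REG_SIZE / 4) as in the new case
    * below). */
   unsigned uniform_grfs = ALIGN(num_components, REG_SIZE / 4) * 4 / REG_SIZE;

   printf("divergent vec4 load: %u GRFs, uniform block load: %u GRF(s)\n",
          divergent_grfs, uniform_grfs);   /* prints 8 vs 1 */
   return 0;
}

For a SIMD16 vec4 load that is 8 GRFs versus 1, which is where the reduced register pressure cited in the pass comments comes from.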
@@ -4938,6 +4938,62 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_load_ssbo_uniform_block_intel:
+   case nir_intrinsic_load_shared_uniform_block_intel: {
+      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
+
+      const bool is_ssbo =
+         instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel;
+      srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
+         get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM));
+
+      const unsigned total_dwords = ALIGN(instr->num_components, REG_SIZE / 4);
+      unsigned loaded_dwords = 0;
+
+      const fs_builder ubld1 = bld.exec_all().group(1, 0);
+      const fs_builder ubld8 = bld.exec_all().group(8, 0);
+      const fs_builder ubld16 = bld.exec_all().group(16, 0);
+
+      const fs_reg packed_consts =
+         ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords);
+
+      const nir_src load_offset = is_ssbo ? instr->src[1] : instr->src[0];
+      if (nir_src_is_const(load_offset)) {
+         fs_reg addr = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
+         ubld8.MOV(addr, brw_imm_ud(nir_src_as_uint(load_offset)));
+         srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
+      } else {
+         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
+            bld.emit_uniformize(get_nir_src(load_offset));
+      }
+
+      while (loaded_dwords < total_dwords) {
+         const unsigned block =
+            choose_oword_block_size_dwords(devinfo,
+                                           total_dwords - loaded_dwords);
+         const unsigned block_bytes = block * 4;
+
+         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
+
+         const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
+         ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
+                   retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD),
+                   srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = align(block_bytes, REG_SIZE);
+
+         loaded_dwords += block;
+
+         ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
+                   srcs[SURFACE_LOGICAL_SRC_ADDRESS],
+                   brw_imm_ud(block_bytes));
+      }
+
+      for (unsigned c = 0; c < instr->num_components; c++)
+         bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD),
+                 component(packed_consts, c));
+
+      break;
+   }
+
    case nir_intrinsic_store_output: {
       assert(nir_src_bit_size(instr->src[0]) == 32);
       fs_reg src = get_nir_src(instr->src[0]);
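The while loop above walks the GRF-padded dword total in power-of-two OWord blocks, advancing the address by each block's byte size. A standalone model of that walk (a sketch: choose_block_dwords() is a hypothetical stand-in for Mesa's choose_oword_block_size_dwords(), whose candidate block sizes are not shown in this diff):

#include <stdio.h>

/* Hypothetical stand-in for choose_oword_block_size_dwords(): pick the
 * largest power-of-two dword count (up to 8 OWords = 32 dwords) that does
 * not overshoot what remains.  The real helper lives in brw_fs_nir.cpp. */
static unsigned
choose_block_dwords(unsigned remaining)
{
   unsigned block = 32;
   while (block > remaining)
      block /= 2;
   return block;
}

int main(void)
{
   const unsigned total_dwords = 24;    /* e.g. 17..24 components after GRF padding */
   unsigned loaded_dwords = 0, addr = 0;

   while (loaded_dwords < total_dwords) {
      unsigned block = choose_block_dwords(total_dwords - loaded_dwords);
      /* Each iteration stands for one block-read send at `addr`; the real
       * code also picks SIMD8 vs SIMD16 builders based on the block size. */
      printf("send: %2u dwords at byte offset %u\n", block, addr);
      loaded_dwords += block;
      addr += block * 4;
   }
   return 0;
}

With 24 dwords this emits two sends (16 then 8 dwords), mirroring how the loop covers the whole destination with as few messages as possible.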
@@ -1301,11 +1301,25 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
    if (bit_size > 32)
       return false;
 
-   /* We can handle at most a vec4 right now. Anything bigger would get
-    * immediately split by brw_nir_lower_mem_access_bit_sizes anyway.
-    */
-   if (num_components > 4)
-      return false;
+   if (low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
+       low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel) {
+      if (num_components > 4) {
+         if (!util_is_power_of_two_nonzero(num_components))
+            return false;
+
+         if (bit_size != 32)
+            return false;
+
+         if (num_components > 32)
+            return false;
+      }
+   } else {
+      /* We can handle at most a vec4 right now. Anything bigger would get
+       * immediately split by brw_nir_lower_mem_access_bit_sizes anyway.
+       */
+      if (num_components > 4)
+         return false;
+   }
 
    uint32_t align;
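Restated as a standalone predicate, the new vectorizer rule reads as follows (a sketch: may_vectorize() and is_uniform_block are illustrative names, and the power-of-two test is hand-inlined in place of Mesa's util_is_power_of_two_nonzero()):

#include <assert.h>
#include <stdbool.h>

/* Uniform block loads may grow past vec4, up to 32 power-of-two 32-bit
 * components; everything else keeps the old vec4 cap. */
static bool
may_vectorize(bool is_uniform_block, unsigned num_components, unsigned bit_size)
{
   if (bit_size > 32)
      return false;

   if (is_uniform_block) {
      if (num_components > 4) {
         if ((num_components & (num_components - 1)) != 0)  /* power of two? */
            return false;
         if (bit_size != 32)
            return false;
         if (num_components > 32)
            return false;
      }
      return true;
   }

   /* Anything bigger than a vec4 would be split right back up. */
   return num_components <= 4;
}

int main(void)
{
   assert(may_vectorize(true, 8, 32));    /* vec8 uniform block: allowed */
   assert(!may_vectorize(false, 8, 32));  /* vec8 divergent load: still capped */
   assert(!may_vectorize(true, 24, 32));  /* 24 is not a power of two */
   return 0;
}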
@@ -1447,6 +1461,31 @@ brw_vectorize_lower_mem_access(nir_shader *nir,
       }
 
       OPT(nir_opt_load_store_vectorize, &options);
+
+      /* Only run the blockify optimization on Gfx9+ because although prior HW
+       * versions have support for block loads, they do have limitations on
+       * alignment as well as requiring split sends which are not supported
+       * there.
+       */
+      if (compiler->devinfo->ver >= 9) {
+         /* Required for nir_divergence_analysis() */
+         OPT(nir_convert_to_lcssa, true, true);
+
+         /* When HW supports block loads, using the divergence analysis, try
+          * to find uniform SSBO loads and turn them into block loads.
+          *
+          * Rerun the vectorizer after that to make the largest possible block
+          * loads.
+          *
+          * This is a win on 2 fronts :
+          *   - fewer send messages
+          *   - reduced register pressure
+          */
+         nir_divergence_analysis(nir);
+         if (OPT(brw_nir_blockify_uniform_loads, compiler->devinfo))
+            OPT(nir_opt_load_store_vectorize, &options);
+         OPT(nir_opt_remove_phis);
+      }
    }
 
    OPT(nir_lower_mem_access_bit_sizes,
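The "rerun the vectorizer" step pays off when blockification turns neighbouring loads uniform: the second nir_opt_load_store_vectorize pass can then fuse them into one bigger block load, i.e. one send. A toy model of the simplest fusion case (byte-adjacent 32-bit ranges on the same resource, within the 32-component cap added above; try_fuse() is illustrative, not Mesa code):

#include <stdio.h>

struct load { unsigned byte_offset, num_components; };

/* Fuse two uniform block loads when the first ends exactly where the
 * second begins and the merged width stays within the 32-component cap. */
static int
try_fuse(struct load *a, const struct load *b)
{
   if (a->byte_offset + a->num_components * 4 != b->byte_offset)
      return 0;
   if (a->num_components + b->num_components > 32)
      return 0;
   a->num_components += b->num_components;
   return 1;
}

int main(void)
{
   struct load lo = { .byte_offset = 0,  .num_components = 4 };
   struct load hi = { .byte_offset = 16, .num_components = 4 };

   if (try_fuse(&lo, &hi))     /* two vec4 block loads -> one vec8 send */
      printf("fused: %u dwords at byte offset %u\n",
             lo.num_components, lo.byte_offset);
   return 0;
}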
@@ -187,6 +187,9 @@ bool brw_nir_opt_peephole_imul32x16(nir_shader *shader);
 bool brw_nir_clamp_per_vertex_loads(nir_shader *shader,
                                     unsigned input_vertices);
 
+bool brw_nir_blockify_uniform_loads(nir_shader *shader,
+                                    const struct intel_device_info *devinfo);
+
 void brw_nir_optimize(nir_shader *nir,
                       const struct brw_compiler *compiler,
                       bool is_scalar);
src/intel/compiler/brw_nir_blockify_uniform_loads.c (new file, 101 lines)
@@ -0,0 +1,101 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "isl/isl.h"
+
+#include "brw_nir.h"
+
+static bool
+brw_nir_blockify_uniform_loads_instr(nir_builder *b,
+                                     nir_instr *instr,
+                                     void *cb_data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   const struct intel_device_info *devinfo = cb_data;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+   switch (intrin->intrinsic) {
+   case nir_intrinsic_load_ssbo:
+      /* BDW PRMs, Volume 7: 3D-Media-GPGPU: OWord Block ReadWrite:
+       *
+       *    "The surface base address must be OWord-aligned."
+       *
+       * We can't make that guarantee with SSBOs where the alignment is
+       * 4bytes.
+       */
+      if (devinfo->ver < 9)
+         return false;
+
+      if (nir_src_is_divergent(intrin->src[1]))
+         return false;
+
+      if (nir_dest_bit_size(intrin->dest) != 32)
+         return false;
+
+      /* Without the LSC, we can only do block loads of at least 4dwords (1
+       * oword).
+       */
+      if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4)
+         return false;
+
+      intrin->intrinsic = nir_intrinsic_load_ssbo_uniform_block_intel;
+      return true;
+
+   case nir_intrinsic_load_shared:
+      /* Block loads on shared memory are not supported before the LSC. */
+      if (!devinfo->has_lsc)
+         return false;
+
+      if (nir_src_is_divergent(intrin->src[0]))
+         return false;
+
+      if (nir_dest_bit_size(intrin->dest) != 32)
+         return false;
+
+      /* Without the LSC, we can only do block loads of at least 4dwords (1
+       * oword).
+       */
+      if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4)
+         return false;
+
+      intrin->intrinsic = nir_intrinsic_load_shared_uniform_block_intel;
+      return true;
+
+   default:
+      return false;
+   }
+}
+
+bool
+brw_nir_blockify_uniform_loads(nir_shader *shader,
+                               const struct intel_device_info *devinfo)
+{
+   return nir_shader_instructions_pass(shader,
+                                       brw_nir_blockify_uniform_loads_instr,
+                                       nir_metadata_block_index |
+                                       nir_metadata_dominance |
+                                       nir_metadata_live_ssa_defs,
+                                       (void *) devinfo);
+}
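For reference, the Gfx9+ sequence added to brw_vectorize_lower_mem_access above reduces to the following once the OPT() progress bookkeeping is stripped (a sketch that only compiles inside the Mesa tree; blockify_and_revectorize() is an illustrative wrapper name, and `options` is the vectorizer options struct built earlier in that function):

#include "brw_nir.h"   /* brw_nir_blockify_uniform_loads(), pulls in nir.h */

static void
blockify_and_revectorize(nir_shader *nir,
                         const struct brw_compiler *compiler,
                         const nir_load_store_vectorize_options *options)
{
   /* LCSSA form is required by the divergence analysis. */
   nir_convert_to_lcssa(nir, true, true);

   /* Compute per-SSA-value divergence, queried via nir_src_is_divergent(). */
   nir_divergence_analysis(nir);

   /* Promote uniform load_ssbo/load_shared to *_uniform_block_intel, let
    * the vectorizer grow the blocks, then drop the LCSSA phis. */
   if (brw_nir_blockify_uniform_loads(nir, compiler->devinfo))
      nir_opt_load_store_vectorize(nir, options);
   nir_opt_remove_phis(nir);
}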
@@ -86,6 +86,7 @@ libintel_compiler_files = files(
   'brw_nir_analyze_boolean_resolves.c',
   'brw_nir_analyze_ubo_ranges.c',
   'brw_nir_attribute_workarounds.c',
+  'brw_nir_blockify_uniform_loads.c',
   'brw_nir_clamp_per_vertex_loads.c',
   'brw_nir_lower_conversions.c',
   'brw_nir_lower_cs_intrinsics.c',