intel/nir: Enable load/store vectorization
This commit enables the I/O vectorization pass that was originally
written for ACO for Intel drivers.  We enable it for UBOs, SSBOs,
global memory, and SLM.  We only enable vectorization for the scalar
back-end because the vec4 back-end makes certain alignment assumptions.

Shader-db results with iris on ICL:

    total instructions in shared programs: 16077927 -> 16068236 (-0.06%)
    instructions in affected programs: 199839 -> 190148 (-4.85%)
    helped: 324
    HURT: 0
    helped stats (abs) min: 2 max: 458 x̄: 29.91 x̃: 4
    helped stats (rel) min: 0.11% max: 38.94% x̄: 4.32% x̃: 1.64%
    95% mean confidence interval for instructions value: -37.02 -22.80
    95% mean confidence interval for instructions %-change: -5.07% -3.58%
    Instructions are helped.

    total cycles in shared programs: 336806135 -> 336151501 (-0.19%)
    cycles in affected programs: 16009735 -> 15355101 (-4.09%)
    helped: 458
    HURT: 154
    helped stats (abs) min: 1 max: 77812 x̄: 1542.50 x̃: 75
    helped stats (rel) min: <.01% max: 34.46% x̄: 5.16% x̃: 2.01%
    HURT stats (abs) min: 1 max: 22800 x̄: 336.55 x̃: 20
    HURT stats (rel) min: <.01% max: 17.11% x̄: 2.12% x̃: 1.00%
    95% mean confidence interval for cycles value: -1596.83 -542.49
    95% mean confidence interval for cycles %-change: -3.83% -2.82%
    Cycles are helped.

    total sends in shared programs: 814177 -> 809049 (-0.63%)
    sends in affected programs: 15422 -> 10294 (-33.25%)
    helped: 324
    HURT: 0
    helped stats (abs) min: 1 max: 256 x̄: 15.83 x̃: 2
    helped stats (rel) min: 1.33% max: 67.90% x̄: 21.21% x̃: 15.38%
    95% mean confidence interval for sends value: -19.67 -11.98
    95% mean confidence interval for sends %-change: -23.03% -19.39%
    Sends are helped.

    LOST:   7
    GAINED: 2

Most of the helped shaders were in the following titles:

 - Doom
 - Deus Ex: Mankind Divided
 - Aztec Ruins
 - Shadow of Mordor
 - DiRT Showdown
 - Tomb Raider (Rise, I think)

Five of the lost programs are SIMD16 shaders we lost from DiRT Showdown.
The other two are compute shaders in Aztec Ruins which switched from
SIMD8 to SIMD16.
Vulkan pipeline-db stats on ICL: Instructions in all programs: 296780486 -> 293493363 (-1.1%) Loops in all programs: 149669 -> 149669 (+0.0%) Cycles in all programs: 90999206722 -> 88513844563 (-2.7%) Spills in all programs: 1710217 -> 1730691 (+1.2%) Fills in all programs: 1931235 -> 1958138 (+1.4%) By far the most help was in the Tomb Raider games. A couple of Batman games with DXVK were also helped. In Shadow of the Tomb Raider: Instructions in all programs: 41614336 -> 39408023 (-5.3%) Loops in all programs: 32200 -> 32200 (+0.0%) Cycles in all programs: 1875498485 -> 1667034831 (-11.1%) Spills in all programs: 196307 -> 214945 (+9.5%) Fills in all programs: 282736 -> 307113 (+8.6%) Benchmarks of real games I've done on this patch: - Rise of the Tomb Raider: +3% - Shadow of the Tomb Raider: +10% Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4367> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4367>
This commit is contained in:

committed by
Marge Bot

parent
36a32af008
commit
991c426160
@@ -824,6 +824,31 @@ brw_nir_link_shaders(const struct brw_compiler *compiler,
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
brw_nir_should_vectorize_mem(unsigned align, unsigned bit_size,
|
||||
unsigned num_components, unsigned high_offset,
|
||||
nir_intrinsic_instr *low,
|
||||
nir_intrinsic_instr *high)
|
||||
{
|
||||
/* Don't combine things to generate 64-bit loads/stores. We have to split
|
||||
* those back into 32-bit ones anyway and UBO loads aren't split in NIR so
|
||||
* we don't want to make a mess for the back-end.
|
||||
*/
|
||||
if (bit_size > 32)
|
||||
return false;
|
||||
|
||||
/* We can handle at most a vec4 right now. Anything bigger would get
|
||||
* immediately split by brw_nir_lower_mem_access_bit_sizes anyway.
|
||||
*/
|
||||
if (num_components > 4)
|
||||
return false;
|
||||
|
||||
if (align < bit_size / 8)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static
|
||||
bool combine_all_barriers(nir_intrinsic_instr *a,
|
||||
nir_intrinsic_instr *b,
|
||||
@@ -844,6 +869,35 @@ bool combine_all_barriers(nir_intrinsic_instr *a,
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Vectorize memory access where possible, then lower it to access sizes the
 * hardware can actually do.
 *
 * Vectorization is only run for the scalar back-end (is_scalar); per the
 * guard in brw_nir_should_vectorize_mem it covers UBOs, SSBOs, global
 * memory, and shared (SLM) memory.  Afterwards, a small optimization loop
 * cleans up the pack/unpack code the lowering produces, iterating until it
 * reaches a fixed point.
 */
static void
brw_vectorize_lower_mem_access(nir_shader *nir,
                               const struct brw_compiler *compiler,
                               bool is_scalar)
{
   const struct gen_device_info *devinfo = compiler->devinfo;
   /* NOTE(review): `progress` appears to be updated as a side effect of the
    * OPT() macro (otherwise the loop below would be dead code) — confirm
    * against the macro's definition earlier in this file.
    */
   bool progress = false;

   if (is_scalar) {
      OPT(nir_opt_load_store_vectorize,
          nir_var_mem_ubo | nir_var_mem_ssbo |
          nir_var_mem_global | nir_var_mem_shared,
          brw_nir_should_vectorize_mem);
   }

   /* Split accesses that are too wide or misaligned for the hardware. */
   OPT(brw_nir_lower_mem_access_bit_sizes, devinfo);

   /* Clean up after the lowering until nothing changes any more. */
   while (progress) {
      progress = false;

      OPT(nir_lower_pack);
      OPT(nir_copy_prop);
      OPT(nir_opt_dce);
      OPT(nir_opt_cse);
      OPT(nir_opt_algebraic);
      OPT(nir_opt_constant_folding);
   }
}
|
||||
|
||||
/* Prepare the given shader for codegen
|
||||
*
|
||||
* This function is intended to be called right before going into the actual
|
||||
@@ -870,17 +924,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
||||
|
||||
brw_nir_optimize(nir, compiler, is_scalar, false);
|
||||
|
||||
if (OPT(brw_nir_lower_mem_access_bit_sizes, devinfo)) {
|
||||
do {
|
||||
progress = false;
|
||||
OPT(nir_lower_pack);
|
||||
OPT(nir_copy_prop);
|
||||
OPT(nir_opt_dce);
|
||||
OPT(nir_opt_cse);
|
||||
OPT(nir_opt_algebraic);
|
||||
OPT(nir_opt_constant_folding);
|
||||
} while (progress);
|
||||
}
|
||||
brw_vectorize_lower_mem_access(nir, compiler, is_scalar);
|
||||
|
||||
if (OPT(nir_lower_int64, nir->options->lower_int64_options))
|
||||
brw_nir_optimize(nir, compiler, is_scalar, false);
|
||||
|
Reference in New Issue
Block a user