intel/fs: make use of load_ubo_uniform_block_intel

The principle is the same as for load_ssbo_uniform_block_intel:
whenever we see a uniform offset, load the data only once into GRFs to
reduce register pressure.

Iris shader-db run on DG2:

total instructions in shared programs: 23001325 -> 23094969 (0.41%)
instructions in affected programs: 1775989 -> 1869633 (5.27%)
helped: 764
HURT: 2097
helped stats (abs) min: 1 max: 102 x̄: 6.96 x̃: 2
helped stats (rel) min: 0.03% max: 16.91% x̄: 1.36% x̃: 0.63%
HURT stats (abs)   min: 1 max: 2461 x̄: 47.19 x̃: 7
HURT stats (rel)   min: <.01% max: 199.34% x̄: 5.91% x̃: 2.60%
95% mean confidence interval for instructions value: 25.43 40.03
95% mean confidence interval for instructions %-change: 3.60% 4.33%
Instructions are HURT.

total loops in shared programs: 5847 -> 5847 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0

total cycles in shared programs: 839329852 -> 845491482 (0.73%)
cycles in affected programs: 130229434 -> 136391064 (4.73%)
helped: 1098
HURT: 2228
helped stats (abs) min: 1 max: 130102 x̄: 1340.64 x̃: 22
helped stats (rel) min: <.01% max: 64.25% x̄: 4.03% x̃: 0.71%
HURT stats (abs)   min: 1 max: 185309 x̄: 3426.24 x̃: 87
HURT stats (rel)   min: <.01% max: 92.85% x̄: 8.12% x̃: 3.82%
95% mean confidence interval for cycles value: 1342.16 2362.97
95% mean confidence interval for cycles %-change: 3.70% 4.52%
Cycles are HURT.

total spills in shared programs: 10768 -> 11856 (10.10%)
spills in affected programs: 9717 -> 10805 (11.20%)
helped: 25
HURT: 28

total fills in shared programs: 13720 -> 16258 (18.50%)
fills in affected programs: 12016 -> 14554 (21.12%)
helped: 25
HURT: 28

total sends in shared programs: 1034790 -> 1031266 (-0.34%)
sends in affected programs: 33416 -> 29892 (-10.55%)
helped: 1005
HURT: 0
helped stats (abs) min: 1 max: 22 x̄: 3.51 x̃: 3
helped stats (rel) min: 1.69% max: 60.00% x̄: 15.20% x̃: 14.08%
95% mean confidence interval for sends value: -3.72 -3.29
95% mean confidence interval for sends %-change: -15.82% -14.57%
Sends are helped.

LOST:   26
GAINED: 183

shader-db on a number of VK/DX titles on DG2:

 PERCENTAGE DELTAS  Shaders   Instrs    Cycles
 age_of_wonders_III 1928      +0.02%    -0.19%

 PERCENTAGE DELTAS       Shaders   Instrs    Cycles  Subgroup size Send messages Spill count Fill count Max live registers Max dispatch width
 assassins_creed_odyssey 2119      +1.12%    -0.42%      -0.03%        -0.29%       -9.10%     -4.26%         -0.64%             +0.65%

 PERCENTAGE DELTAS Shaders   Instrs    Cycles  Spill count Fill count Max live registers
 aztec_ruins_high  269       -0.05%    -0.45%     -0.29%     -7.27%         -0.33%

 PERCENTAGE DELTAS    Shaders   Instrs    Cycles  Max live registers Max dispatch width
 dark_souls_3_dxvk_g2 1420      +0.09%    +0.24%        +0.21%             +0.12%

(stats look bad, but only one shader is affected)
 PERCENTAGE DELTAS Shaders   Instrs    Cycles  Spill count Fill count Scratch Memory Size Max live registers
 fallout_4_dxvk_g2 1638      +0.67%    +8.32%    +16.02%     +7.17%         +100.00%            +0.48%

 PERCENTAGE DELTAS    Shaders   Instrs    Cycles  Send messages Spill count Fill count Max live registers Max dispatch width
 red_dead_redemption2 5969      +0.16%    -0.04%      -0.04%       +0.01%     +0.05%         -0.20%             +0.04%

 PERCENTAGE DELTAS          Shaders   Instrs    Cycles  Send messages Max live registers Max dispatch width
 rise_of_the_tomb_raider_g2 12129     +2.19%    +1.36%      -1.23%          -0.36%             +2.04%

 PERCENTAGE DELTAS Shaders   Instrs    Cycles  Send messages Max live registers
 shooter-game      693       +0.07%    -0.89%      -0.09%          -0.09%

 PERCENTAGE DELTAS Shaders   Instrs    Cycles  Send messages Max live registers Max dispatch width
 talos_g2          1140      +0.37%    +3.80%      -0.86%          -0.67%             +0.19%

 PERCENTAGE DELTAS    Shaders   Instrs    Cycles  Max live registers Max dispatch width
 total_war_warhammer2 477       +0.25%    +0.66%        -0.17%             +0.10%

 PERCENTAGE DELTAS Shaders   Instrs    Cycles  Send messages Max live registers Max dispatch width
 witcher_3_dxvk_g2 1074      +0.75%   -10.45%      -0.15%          -0.16%             -0.16%

 PERCENTAGE DELTAS      Shaders   Instrs    Cycles  Send messages Max live registers
 wolfenstein_youngblood 1111      +0.52%    +0.66%      -0.59%          -0.03%

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23477>
This commit is contained in:
Lionel Landwerlin
2023-06-06 18:03:26 +03:00
committed by Marge Bot
parent 4a23a5a904
commit 5ae8a78d8c
5 changed files with 75 additions and 11 deletions

View File

@@ -1765,7 +1765,8 @@ store("shared_block_intel", [1], [BASE, WRITE_MASK, ALIGN_MUL, ALIGN_OFFSET])
# Similar to load_global_const_block_intel but for UBOs # Similar to load_global_const_block_intel but for UBOs
# offset should be uniform # offset should be uniform
# src[] = { buffer_index, offset }. # src[] = { buffer_index, offset }.
load("ubo_uniform_block_intel", [-1, 1], [ACCESS, ALIGN_MUL, ALIGN_OFFSET, RANGE_BASE, RANGE], [CAN_ELIMINATE]) load("ubo_uniform_block_intel", [-1, 1],
[ACCESS, ALIGN_MUL, ALIGN_OFFSET, RANGE_BASE, RANGE], [CAN_ELIMINATE, CAN_REORDER])
# Similar to load_global_const_block_intel but for SSBOs # Similar to load_global_const_block_intel but for SSBOs
# offset should be uniform # offset should be uniform

View File

@@ -4836,7 +4836,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
break; break;
} }
case nir_intrinsic_load_ubo: { case nir_intrinsic_load_ubo:
case nir_intrinsic_load_ubo_uniform_block_intel: {
fs_reg surface, surface_handle; fs_reg surface, surface_handle;
if (get_nir_src_bindless(instr->src[0])) if (get_nir_src_bindless(instr->src[0]))
@@ -4845,16 +4846,72 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
surface = get_nir_buffer_intrinsic_index(bld, instr); surface = get_nir_buffer_intrinsic_index(bld, instr);
if (!nir_src_is_const(instr->src[1])) { if (!nir_src_is_const(instr->src[1])) {
fs_reg base_offset = retype(get_nir_src(instr->src[1]), if (instr->intrinsic == nir_intrinsic_load_ubo) {
BRW_REGISTER_TYPE_UD); /* load_ubo with non-uniform offset */
fs_reg base_offset = retype(get_nir_src(instr->src[1]),
BRW_REGISTER_TYPE_UD);
for (int i = 0; i < instr->num_components; i++) for (int i = 0; i < instr->num_components; i++)
VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i),
surface, surface_handle, surface, surface_handle,
base_offset, i * type_sz(dest.type), base_offset, i * type_sz(dest.type),
nir_dest_bit_size(instr->dest) / 8); nir_dest_bit_size(instr->dest) / 8);
prog_data->has_ubo_pull = true; prog_data->has_ubo_pull = true;
} else {
/* load_ubo with uniform offset */
const fs_builder ubld1 = bld.exec_all().group(1, 0);
const fs_builder ubld8 = bld.exec_all().group(8, 0);
const fs_builder ubld16 = bld.exec_all().group(16, 0);
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[SURFACE_LOGICAL_SRC_SURFACE] = surface;
srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = surface_handle;
const nir_src load_offset = instr->src[1];
if (nir_src_is_const(load_offset)) {
fs_reg addr = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
ubld8.MOV(addr, brw_imm_ud(nir_src_as_uint(load_offset)));
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
} else {
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
bld.emit_uniformize(get_nir_src(load_offset));
}
const unsigned total_dwords = ALIGN(instr->num_components, REG_SIZE / 4);
unsigned loaded_dwords = 0;
const fs_reg packed_consts =
ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords);
while (loaded_dwords < total_dwords) {
const unsigned block =
choose_oword_block_size_dwords(devinfo,
total_dwords - loaded_dwords);
const unsigned block_bytes = block * 4;
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD),
srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = align(block_bytes, REG_SIZE);
loaded_dwords += block;
ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
srcs[SURFACE_LOGICAL_SRC_ADDRESS],
brw_imm_ud(block_bytes));
}
for (unsigned c = 0; c < instr->num_components; c++) {
bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD),
component(packed_consts, c));
}
prog_data->has_ubo_pull = true;
}
} else { } else {
/* Even if we are loading doubles, a pull constant load will load /* Even if we are loading doubles, a pull constant load will load
* a 32-bit vec4, so should only reserve vgrf space for that. If we * a 32-bit vec4, so should only reserve vgrf space for that. If we

View File

@@ -1308,6 +1308,7 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
return false; return false;
if (low->intrinsic == nir_intrinsic_load_global_const_block_intel || if (low->intrinsic == nir_intrinsic_load_global_const_block_intel ||
low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel || low->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel) { low->intrinsic == nir_intrinsic_load_shared_uniform_block_intel) {
if (num_components > 4) { if (num_components > 4) {

View File

@@ -37,6 +37,7 @@ brw_nir_blockify_uniform_loads_instr(nir_builder *b,
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) { switch (intrin->intrinsic) {
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_ssbo: case nir_intrinsic_load_ssbo:
/* BDW PRMs, Volume 7: 3D-Media-GPGPU: OWord Block ReadWrite: /* BDW PRMs, Volume 7: 3D-Media-GPGPU: OWord Block ReadWrite:
* *
@@ -60,7 +61,10 @@ brw_nir_blockify_uniform_loads_instr(nir_builder *b,
if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4) if (!devinfo->has_lsc && nir_dest_num_components(intrin->dest) < 4)
return false; return false;
intrin->intrinsic = nir_intrinsic_load_ssbo_uniform_block_intel; intrin->intrinsic =
intrin->intrinsic == nir_intrinsic_load_ubo ?
nir_intrinsic_load_ubo_uniform_block_intel :
nir_intrinsic_load_ssbo_uniform_block_intel;
return true; return true;
case nir_intrinsic_load_shared: case nir_intrinsic_load_shared:

View File

@@ -92,6 +92,7 @@ brw_nir_lower_non_uniform_intrinsic(nir_builder *b,
case nir_intrinsic_ssbo_atomic_swap: case nir_intrinsic_ssbo_atomic_swap:
case nir_intrinsic_load_ssbo_block_intel: case nir_intrinsic_load_ssbo_block_intel:
case nir_intrinsic_store_ssbo_block_intel: case nir_intrinsic_store_ssbo_block_intel:
case nir_intrinsic_load_ubo_uniform_block_intel:
case nir_intrinsic_load_ssbo_uniform_block_intel: case nir_intrinsic_load_ssbo_uniform_block_intel:
case nir_intrinsic_image_load_raw_intel: case nir_intrinsic_image_load_raw_intel:
case nir_intrinsic_image_store_raw_intel: case nir_intrinsic_image_store_raw_intel: