intel/compiler: Use subgroup invocation for ICP handle loads
When loading a TCS or GS input, we generate some code to read the URB handle for a particular input control point (ICP handle), which often involves indirect addressing due to a non-constant vertex. For example:

    mov(8) vgrf148+0.0:UW, 76543210V
    shl(8) vgrf149:UD, vgrf148+0.0:UW, 2u
    shl(8) vgrf150:UD, vgrf145:UD, 5u
    add(8) vgrf151:UD, vgrf150:UD, vgrf149:UD
    mov_indirect(8) vgrf147:UD, g2:UD, vgrf151:UD, 96u

Unfortunately, the first load with 76543210V is considered a partial write because the 8 channels of 16-bit UW data don't fill an entire register, and we can't allocate VGRFs at sub-register granularity. This causes none of the above math to be CSE'd, even though the first two instructions are common to *all* input loads, and the rest may be reused sometimes as well.

To work around this, we stop emitting 76543210V to a temporary, and instead use nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], which already contains this value, and is unconditionally set up for us. With all input loads using the same register for the sequence, our CSE pass is able to eliminate the rest of the common math.

shader-db results on Tigerlake:

    total instructions in shared programs: 20748243 -> 20744844 (-0.02%)
    instructions in affected programs: 73410 -> 70011 (-4.63%)
    helped: 242 / HURT: 21
    helped stats (abs) min: 1 max: 37 x̄: 14.17 x̃: 15
    helped stats (rel) min: 0.17% max: 19.58% x̄: 6.13% x̃: 6.32%
    HURT stats (abs) min: 1 max: 4 x̄: 1.38 x̃: 1
    HURT stats (rel) min: 0.18% max: 1.31% x̄: 0.58% x̃: 0.58%
    95% mean confidence interval for instructions value: -13.73 -12.12
    95% mean confidence interval for instructions %-change: -6.00% -5.19%
    Instructions are helped.

    total cycles in shared programs: 785828951 -> 785788480 (<.01%)
    cycles in affected programs: 597593 -> 557122 (-6.77%)
    helped: 227 / HURT: 13
    helped stats (abs) min: 6 max: 624 x̄: 182.19 x̃: 185
    helped stats (rel) min: 0.24% max: 18.22% x̄: 7.85% x̃: 7.80%
    HURT stats (abs) min: 2 max: 153 x̄: 68.08 x̃: 36
    HURT stats (rel) min: 0.03% max: 7.79% x̄: 2.97% x̃: 1.25%
    95% mean confidence interval for cycles value: -182.55 -154.71
    95% mean confidence interval for cycles %-change: -7.84% -6.69%
    Cycles are helped.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18455>
This commit is contained in:

committed by
Marge Bot

parent
4d7fe94f3a
commit
19fc870ac6
@@ -2542,13 +2542,12 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
|
||||
* by 32 (shifting by 5), and add the two together. This is
|
||||
* the final indirect byte offset.
|
||||
*/
|
||||
fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
|
||||
fs_reg sequence =
|
||||
nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
|
||||
fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
|
||||
/* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
|
||||
bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
|
||||
/* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
|
||||
bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
|
||||
/* Convert vertex_index to bytes (multiply by 32) */
|
||||
@@ -2780,13 +2779,11 @@ fs_visitor::get_tcs_multi_patch_icp_handle(const fs_builder &bld,
|
||||
* the final indirect byte offset.
|
||||
*/
|
||||
fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
|
||||
fs_reg sequence = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
|
||||
fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
|
||||
|
||||
/* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
|
||||
bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
|
||||
/* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
|
||||
bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
|
||||
/* Convert vertex_index to bytes (multiply by 32) */
|
||||
|
Reference in New Issue
Block a user