intel/fs: Make logical URB write instructions more like other logical instructions

The changes to fs_visitor::validate() helped track down a place where I
initially forgot to convert a message to the new sources layout.  This
had caused a different validation failure in
dEQP-GLES31.functional.tessellation.tesscoord.triangles_equal_spacing,
but this were not detected until after SENDs were lowered.

Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 19951145 -> 19951133 (<.01%)
instructions in affected programs: 2429 -> 2417 (-0.49%)
helped: 8 / HURT: 0

total cycles in shared programs: 858904152 -> 858862331 (<.01%)
cycles in affected programs: 5702652 -> 5660831 (-0.73%)
helped: 2138 / HURT: 1255

Broadwell
total cycles in shared programs: 904869459 -> 904835501 (<.01%)
cycles in affected programs: 7686744 -> 7652786 (-0.44%)
helped: 2861 / HURT: 2050

Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
Instructions in all programs: 141442369 -> 141442032 (-0.0%)
Instructions helped: 337

Cycles in all programs: 9099270231 -> 9099036492 (-0.0%)
Cycles helped: 40661
Cycles hurt: 28606

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17605>
This commit is contained in:
Ian Romanick
2022-07-12 15:32:01 -07:00
committed by Marge Bot
parent 5dab077824
commit 349a040f68
7 changed files with 174 additions and 119 deletions

View File

@@ -935,22 +935,15 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
if (length == 8 || (length > 0 && slot == last_slot))
flush = true;
if (flush) {
fs_reg *payload_sources =
ralloc_array(mem_ctx, fs_reg, length + header_size);
fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size),
BRW_REGISTER_TYPE_F);
payload_sources[0] = urb_handle;
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
if (opcode == SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL)
payload_sources[1] = per_slot_offsets;
srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length),
BRW_REGISTER_TYPE_F);
abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
memcpy(&payload_sources[header_size], sources,
length * sizeof sources[0]);
abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
header_size);
fs_inst *inst = abld.emit(opcode, reg_undef, payload);
fs_inst *inst = abld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs));
/* For ICL WA 1805992985 one needs additional write in the end. */
if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL)
@@ -985,10 +978,17 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
if (stage == MESA_SHADER_GEOMETRY)
return;
fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
bld.exec_all().MOV(payload, urb_handle);
fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, payload);
bld.exec_all().MOV(uniform_urb_handle, urb_handle);
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
srcs[URB_LOGICAL_SRC_DATA] = payload;
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
srcs, ARRAY_SIZE(srcs));
inst->eot = true;
inst->mlen = 2;
inst->offset = 1;
@@ -1002,14 +1002,16 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
* all 8 lanes must valid.
*/
if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) {
fs_reg payload = fs_reg(VGRF, alloc.allocate(6), BRW_REGISTER_TYPE_UD);
fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
fs_reg uniform_mask = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
fs_reg payload = fs_reg(VGRF, alloc.allocate(4), BRW_REGISTER_TYPE_UD);
/* Workaround requires all 8 channels (lanes) to be valid. This is
* understood to mean they all need to be alive. First trick is to find
* a live channel and copy its urb handle for all the other channels to
* make sure all handles are valid.
*/
bld.exec_all().MOV(payload, bld.emit_uniformize(urb_handle));
bld.exec_all().MOV(uniform_urb_handle, bld.emit_uniformize(urb_handle));
/* Second trick is to use masked URB write where one can tell the HW to
* actually write data only for selected channels even though all are
@@ -1025,14 +1027,19 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
* 4 slots data. All are explicitly zeros in order to to keep the MBZ
* area written as zeros.
*/
bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0x10000u));
bld.exec_all().MOV(uniform_mask, brw_imm_ud(0x10000u));
bld.exec_all().MOV(offset(payload, bld, 0), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 4), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 5), brw_imm_ud(0u));
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = uniform_mask;
srcs[URB_LOGICAL_SRC_DATA] = payload;
fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
reg_undef, payload);
reg_undef, srcs, ARRAY_SIZE(srcs));
inst->eot = true;
inst->mlen = 6;
inst->offset = 0;