intel/fs: Make logical URB write instructions more like other logical instructions

The changes to fs_visitor::validate() helped track down a place where I
initially forgot to convert a message to the new sources layout.  This
had caused a different validation failure in
dEQP-GLES31.functional.tessellation.tesscoord.triangles_equal_spacing,
but this was not detected until after SENDs were lowered.

Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 19951145 -> 19951133 (<.01%)
instructions in affected programs: 2429 -> 2417 (-0.49%)
helped: 8 / HURT: 0

total cycles in shared programs: 858904152 -> 858862331 (<.01%)
cycles in affected programs: 5702652 -> 5660831 (-0.73%)
helped: 2138 / HURT: 1255

Broadwell
total cycles in shared programs: 904869459 -> 904835501 (<.01%)
cycles in affected programs: 7686744 -> 7652786 (-0.44%)
helped: 2861 / HURT: 2050

Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
Instructions in all programs: 141442369 -> 141442032 (-0.0%)
Instructions helped: 337

Cycles in all programs: 9099270231 -> 9099036492 (-0.0%)
Cycles helped: 40661
Cycles hurt: 28606

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17605>
This commit is contained in:
Ian Romanick
2022-07-12 15:32:01 -07:00
committed by Marge Bot
parent 5dab077824
commit 349a040f68
7 changed files with 174 additions and 119 deletions

View File

@@ -950,6 +950,17 @@ enum rt_logical_srcs {
RT_LOGICAL_NUM_SRCS
};
/**
 * Source slots of the logical URB write instructions.
 *
 * The handle is always present.  The per-slot offsets and the channel mask
 * are optional; a slot whose register file is BAD_FILE is treated as absent
 * and does not contribute to the message header.
 */
enum urb_logical_srcs {
   /** URB handle; always the first register of the message. */
   URB_LOGICAL_SRC_HANDLE           = 0,
   /** Optional per-slot offsets (BAD_FILE when not used). */
   URB_LOGICAL_SRC_PER_SLOT_OFFSETS = 1,
   /** Optional channel mask (BAD_FILE when not used). */
   URB_LOGICAL_SRC_CHANNEL_MASK     = 2,
   /** Payload that gets written.  Left as BAD_FILE for reads. */
   URB_LOGICAL_SRC_DATA             = 3,
   /** Number of source slots; used to size source arrays. */
   URB_LOGICAL_NUM_SRCS             = 4
};
#ifdef __cplusplus
/**
* Allow brw_urb_write_flags enums to be ORed together.

View File

@@ -863,6 +863,17 @@ fs_inst::components_read(unsigned i) const
return 1;
}
case SHADER_OPCODE_URB_WRITE_LOGICAL:
case SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL:
case SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL:
case SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL:
if (i == URB_LOGICAL_SRC_DATA)
return mlen - 1 -
unsigned(src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE) -
unsigned(src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE);
else
return 1;
default:
return 1;
}
@@ -891,10 +902,6 @@ fs_inst::size_read(int arg) const
break;
case FS_OPCODE_FB_READ:
case SHADER_OPCODE_URB_WRITE_LOGICAL:
case SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL:
case SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL:
case SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL:
case SHADER_OPCODE_URB_READ_LOGICAL:
case SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL:
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
@@ -1546,17 +1553,17 @@ fs_visitor::emit_gs_thread_end()
break;
}
}
fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, hdr);
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
srcs, ARRAY_SIZE(srcs));
inst->mlen = 1;
} else {
fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
sources[1] = this->final_gs_vertex_count;
abld.LOAD_PAYLOAD(payload, sources, 2, 2);
inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, payload);
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
srcs, ARRAY_SIZE(srcs));
inst->mlen = 2;
}
inst->eot = true;
@@ -6676,16 +6683,12 @@ fs_visitor::run_tcs()
}
/* Emit EOT write; set TR DS Cache bit */
fs_reg srcs[3] = {
fs_reg(get_tcs_output_urb_handle()),
fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
fs_reg(brw_imm_ud(0)),
};
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = get_tcs_output_urb_handle();
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16);
srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0);
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
bld.null_reg_ud(), payload);
reg_undef, srcs, ARRAY_SIZE(srcs));
inst->mlen = 3;
inst->eot = true;

View File

@@ -2341,27 +2341,27 @@ fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
}
/* Store the control data bits in the message payload and send it. */
unsigned mlen = 2;
if (channel_mask.file != BAD_FILE)
mlen += 4; /* channel masks, plus 3 extra copies of the data */
if (per_slot_offset.file != BAD_FILE)
mlen++;
const unsigned header_size = 1 + unsigned(channel_mask.file != BAD_FILE) +
unsigned(per_slot_offset.file != BAD_FILE);
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
unsigned i = 0;
sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
if (per_slot_offset.file != BAD_FILE)
sources[i++] = per_slot_offset;
if (channel_mask.file != BAD_FILE)
sources[i++] = channel_mask;
while (i < mlen) {
sources[i++] = this->control_data_bits;
}
/* If there are channel masks, add 3 extra copies of the data. */
const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
fs_inst *inst = abld.emit(opcode, reg_undef, payload);
inst->mlen = mlen;
fs_reg sources[4];
for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
sources[i] = this->control_data_bits;
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length),
BRW_REGISTER_TYPE_F);
abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
fs_inst *inst = abld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs));
inst->mlen = header_size + length;
/* We need to increment Global Offset by 256-bits to make room for
* Broadwell's extra "Vertex Count" payload at the beginning of the
* URB entry. Since this is an OWord message, Global Offset is counted
@@ -3046,15 +3046,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
fs_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = instr->const_index[0];
unsigned mask = instr->const_index[1];
unsigned header_regs = 0;
struct brw_reg output_handles = get_tcs_output_urb_handle();
fs_reg srcs[7];
srcs[header_regs++] = output_handles;
if (indirect_offset.file != BAD_FILE) {
srcs[header_regs++] = indirect_offset;
}
if (mask == 0)
break;
@@ -3068,8 +3059,9 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
unsigned first_component = nir_intrinsic_component(instr);
mask = mask << first_component;
fs_reg mask_reg;
if (mask != WRITEMASK_XYZW) {
srcs[header_regs++] = brw_imm_ud(mask << 16);
mask_reg = brw_imm_ud(mask << 16);
opcode = indirect_offset.file != BAD_FILE ?
SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL :
SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL;
@@ -3079,21 +3071,30 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
SHADER_OPCODE_URB_WRITE_LOGICAL;
}
fs_reg sources[4];
for (unsigned i = 0; i < num_components; i++) {
if (!(mask & (1 << (i + first_component))))
continue;
srcs[header_regs + i + first_component] = offset(value, bld, i);
sources[i + first_component] = offset(value, bld, i);
}
unsigned mlen = header_regs + num_components + first_component;
fs_reg payload =
bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
unsigned header_size = 1 + unsigned(indirect_offset.file != BAD_FILE) +
unsigned(mask != WRITEMASK_XYZW);
const unsigned length = num_components + first_component;
fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = get_tcs_output_urb_handle();
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length),
BRW_REGISTER_TYPE_F);
bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
fs_inst *inst = bld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs));
inst->offset = imm_offset;
inst->mlen = mlen;
inst->mlen = header_size + length;
break;
}

View File

@@ -43,6 +43,20 @@ fs_visitor::validate()
{
#ifndef NDEBUG
foreach_block_and_inst (block, fs_inst, inst, cfg) {
if (inst->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
const unsigned header_size = 1 +
unsigned(inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE) +
unsigned(inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE);
unsigned data_size = 0;
for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++) {
fsv_assert(type_sz(offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j).type) == 4);
data_size++;
}
fsv_assert(header_size + data_size == inst->mlen);
}
if (inst->dst.file == VGRF) {
fsv_assert(inst->dst.offset / REG_SIZE + regs_written(inst) <=
alloc.sizes[inst->dst.nr]);

View File

@@ -935,22 +935,15 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
if (length == 8 || (length > 0 && slot == last_slot))
flush = true;
if (flush) {
fs_reg *payload_sources =
ralloc_array(mem_ctx, fs_reg, length + header_size);
fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size),
BRW_REGISTER_TYPE_F);
payload_sources[0] = urb_handle;
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
if (opcode == SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL)
payload_sources[1] = per_slot_offsets;
srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length),
BRW_REGISTER_TYPE_F);
abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
memcpy(&payload_sources[header_size], sources,
length * sizeof sources[0]);
abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
header_size);
fs_inst *inst = abld.emit(opcode, reg_undef, payload);
fs_inst *inst = abld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs));
/* For ICL WA 1805992985 one needs additional write in the end. */
if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL)
@@ -985,10 +978,17 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
if (stage == MESA_SHADER_GEOMETRY)
return;
fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
bld.exec_all().MOV(payload, urb_handle);
fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, payload);
bld.exec_all().MOV(uniform_urb_handle, urb_handle);
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
srcs[URB_LOGICAL_SRC_DATA] = payload;
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
srcs, ARRAY_SIZE(srcs));
inst->eot = true;
inst->mlen = 2;
inst->offset = 1;
@@ -1002,14 +1002,16 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
* all 8 lanes must be valid.
*/
if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) {
fs_reg payload = fs_reg(VGRF, alloc.allocate(6), BRW_REGISTER_TYPE_UD);
fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
fs_reg uniform_mask = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
fs_reg payload = fs_reg(VGRF, alloc.allocate(4), BRW_REGISTER_TYPE_UD);
/* Workaround requires all 8 channels (lanes) to be valid. This is
* understood to mean they all need to be alive. First trick is to find
* a live channel and copy its urb handle for all the other channels to
* make sure all handles are valid.
*/
bld.exec_all().MOV(payload, bld.emit_uniformize(urb_handle));
bld.exec_all().MOV(uniform_urb_handle, bld.emit_uniformize(urb_handle));
/* Second trick is to use masked URB write where one can tell the HW to
* actually write data only for selected channels even though all are
@@ -1025,14 +1027,19 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
* 4 slots data. All are explicitly zeros in order to keep the MBZ
* area written as zeros.
*/
bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0x10000u));
bld.exec_all().MOV(uniform_mask, brw_imm_ud(0x10000u));
bld.exec_all().MOV(offset(payload, bld, 0), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 4), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 5), brw_imm_ud(0u));
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = uniform_mask;
srcs[URB_LOGICAL_SRC_DATA] = payload;
fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
reg_undef, payload);
reg_undef, srcs, ARRAY_SIZE(srcs));
inst->eot = true;
inst->mlen = 6;
inst->offset = 0;

View File

@@ -73,8 +73,27 @@ lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst,
assert(inst->header_size == 0);
fs_reg *payload_sources = new fs_reg[inst->mlen];
fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen),
BRW_REGISTER_TYPE_F);
unsigned header_size = 0;
payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
if (per_slot_present)
payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
if (channel_mask_present)
payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++)
payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);
bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size);
delete [] payload_sources;
inst->opcode = SHADER_OPCODE_SEND;
inst->header_size = 1;
inst->header_size = header_size;
inst->dst = brw_null_reg();
inst->sfid = BRW_SFID_URB;
@@ -88,13 +107,11 @@ lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst,
inst->ex_mlen = 0;
inst->send_has_side_effects = true;
fs_reg tmp = inst->src[0];
inst->resize_sources(4);
inst->src[0] = brw_imm_ud(0); /* desc */
inst->src[1] = brw_imm_ud(0); /* ex_desc */
inst->src[2] = tmp;
inst->src[2] = payload;
inst->src[3] = brw_null_reg();
}

View File

@@ -892,25 +892,25 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
fs_builder bld8 = bld.group(8, q);
fs_reg payload_srcs[6];
unsigned p = 0;
payload_srcs[p++] = urb_handle;
payload_srcs[p++] = brw_imm_ud(first_mask << 16);
const unsigned header_size = p;
fs_reg payload_srcs[4];
unsigned length = 0;
for (unsigned i = 0; i < comp_shift; i++)
payload_srcs[p++] = reg_undef;
payload_srcs[length++] = reg_undef;
for (unsigned c = 0; c < first_comps; c++)
payload_srcs[p++] = quarter(offset(src, bld, c), q);
payload_srcs[length++] = quarter(offset(src, bld, c), q);
fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, p);
bld8.LOAD_PAYLOAD(payload, payload_srcs, p, header_size);
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(first_mask << 16);
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
BRW_REGISTER_TYPE_F);
bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
reg_undef, payload);
inst->mlen = p;
reg_undef, srcs, ARRAY_SIZE(srcs));
inst->mlen = 2 + length;
inst->offset = urb_global_offset;
assert(inst->offset < 2048);
}
@@ -923,22 +923,22 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
fs_builder bld8 = bld.group(8, q);
fs_reg payload_srcs[6];
unsigned p = 0;
payload_srcs[p++] = urb_handle;
payload_srcs[p++] = brw_imm_ud(second_mask << 16);
const unsigned header_size = p;
fs_reg payload_srcs[4];
unsigned length = 0;
for (unsigned c = 0; c < second_comps; c++)
payload_srcs[p++] = quarter(offset(src, bld, c + first_comps), q);
payload_srcs[length++] = quarter(offset(src, bld, c + first_comps), q);
fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, p);
bld8.LOAD_PAYLOAD(payload, payload_srcs, p, header_size);
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(second_mask << 16);
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
BRW_REGISTER_TYPE_F);
bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
reg_undef, payload);
inst->mlen = p;
reg_undef, srcs, ARRAY_SIZE(srcs));
inst->mlen = 2 + length;
inst->offset = urb_global_offset;
assert(inst->offset < 2048);
}
@@ -988,21 +988,23 @@ emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
bld8.SHR(off, off, brw_imm_ud(2));
fs_reg payload_srcs[7];
int x = 0;
payload_srcs[x++] = urb_handle;
payload_srcs[x++] = off;
payload_srcs[x++] = mask;
fs_reg payload_srcs[4];
unsigned length = 0;
for (unsigned j = 0; j < 4; j++)
payload_srcs[x++] = quarter(src_comp, q);
payload_srcs[length++] = quarter(src_comp, q);
fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, x);
bld8.LOAD_PAYLOAD(payload, payload_srcs, x, 3);
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask;
srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
BRW_REGISTER_TYPE_F);
bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL,
reg_undef, payload);
inst->mlen = x;
fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
reg_undef, srcs, ARRAY_SIZE(srcs));
inst->mlen = 3 + length;
inst->offset = 0;
}
}