i965/fs: Add support for nir_intrinsic_shuffle

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
This commit is contained in:
Jason Ekstrand
2017-08-29 09:21:32 -07:00
parent 8256ee3fa3
commit 90c9f29518
7 changed files with 150 additions and 0 deletions

View File

@@ -451,6 +451,15 @@ enum opcode {
*/
SHADER_OPCODE_BROADCAST,
/* For each output channel, pick the channel of the first source register
* selected by the index given in the second source.
*
* This is similar to the BROADCAST instruction except that it takes a
* dynamic index and potentially puts a different value in each output
* channel.
*/
SHADER_OPCODE_SHUFFLE,
SHADER_OPCODE_GET_BUFFER_SIZE,
VEC4_OPCODE_MOV_BYTES,

View File

@@ -310,6 +310,13 @@ fs_inst::has_source_and_destination_hazard() const
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
/* Multiple partial writes to the destination */
return true;
case SHADER_OPCODE_SHUFFLE:
/* This instruction returns an arbitrary channel from the source and
* gets split into smaller instructions in the generator. It's possible
* that one of the instructions will read from a channel corresponding
* to an earlier instruction.
*/
return true;
default:
/* The SIMD16 compressed instruction
*
@@ -2531,6 +2538,20 @@ fs_visitor::opt_algebraic()
}
break;
case SHADER_OPCODE_SHUFFLE:
if (is_uniform(inst->src[0])) {
inst->opcode = BRW_OPCODE_MOV;
inst->sources = 1;
progress = true;
} else if (inst->src[1].file == IMM) {
inst->opcode = BRW_OPCODE_MOV;
inst->src[0] = component(inst->src[0],
inst->src[1].ud);
inst->sources = 1;
progress = true;
}
break;
default:
break;
}

View File

@@ -471,6 +471,11 @@ private:
struct brw_reg reg,
struct brw_reg indirect_byte_offset);
void generate_shuffle(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src,
struct brw_reg idx);
bool patch_discard_jumps_to_fb_writes();
const struct brw_compiler *compiler;

View File

@@ -540,6 +540,106 @@ fs_generator::generate_mov_indirect(fs_inst *inst,
}
}
/**
* Emit the native instructions for a shuffle: each channel of \p dst is
* written with a channel of \p src chosen by \p idx.
*
* The instruction is emitted in groups of lower_width channels (see below).
* When the source is already uniform or the index is an immediate, each group
* is a single MOV; otherwise the index is turned into an address-register
* value and the source is read through VxH indirect addressing.
*
* \param inst  the IR instruction being generated; only its exec_size is read
*              here.
* \param dst   destination register.
* \param src   register holding the value to be shuffled.
* \param idx   index selecting the source channel; either an immediate or a
*              register whose element type is at most 4 bytes (asserted).
*/
void
fs_generator::generate_shuffle(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src,
struct brw_reg idx)
{
/* Ivy bridge has some strange behavior that makes this a real pain to
* implement for 64-bit values so we just don't bother.
*/
assert(devinfo->gen >= 8 || devinfo->is_haswell || type_sz(src.type) <= 4);
/* Because we're using the address register, we're limited to 8-wide
* execution on gen7. On gen8, we're limited to 16-wide by the address
* register file and 8-wide for 64-bit types. We could try and make this
* instruction splittable higher up in the compiler but that gets weird
* because it reads all of the channels regardless of execution size. It's
* easier just to split it here.
*/
const unsigned lower_width =
(devinfo->gen <= 7 || type_sz(src.type) > 4) ?
8 : MIN2(16, inst->exec_size);
/* Every instruction emitted below uses lower_width channels; the loop walks
* the full exec_size one group at a time.
*/
brw_set_default_exec_size(p, cvt(lower_width) - 1);
for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
brw_set_default_group(p, group);
if ((src.vstride == 0 && src.hstride == 0) ||
idx.file == BRW_IMMEDIATE_VALUE) {
/* Trivial, the source is already uniform or the index is a constant.
* We will typically not get here if the optimizer is doing its job,
* but asserting would be mean.
*/
/* With an immediate index we read that component; for a uniform source
* component 0 is used.
*/
const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
brw_MOV(p, suboffset(dst, group), stride(suboffset(src, i), 0, 1, 0));
} else {
/* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
/* NOTE(review): lower_width can be 16 on gen8+, in which case more than
* eight address subregisters may be written -- confirm the range stated
* above.
*/
struct brw_reg addr = vec8(brw_address_reg(0));
/* Indices for the channels of the current group. */
struct brw_reg group_idx = suboffset(idx, group);
if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
/* Things get grumpy if the register is too wide. */
group_idx.width--;
group_idx.vstride--;
}
assert(type_sz(group_idx.type) <= 4);
if (type_sz(group_idx.type) == 4) {
/* The destination stride of an instruction (in bytes) must be
* greater than or equal to the size of the rest of the
* instruction. Since the address register is of type UW, we
* can't use a D-type instruction. In order to get around this,
* we retype to a 16-bit type and use a stride.
*
* NOTE(review): the code retypes to BRW_REGISTER_TYPE_W (signed)
* rather than UW -- confirm this is intentional.
*/
group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
}
/* Take into account the component size and horizontal stride. */
/* The shift below is only valid for a source whose vstride encoding
* equals hstride + width, hence the assert.
*/
assert(src.vstride == src.hstride + src.width);
brw_SHL(p, addr, group_idx,
brw_imm_uw(_mesa_logbase2(type_sz(src.type)) +
src.hstride - 1));
/* Add on the register start offset */
brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr));
if (type_sz(src.type) > 4 &&
((devinfo->gen == 7 && !devinfo->is_haswell) ||
devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
/* IVB has an issue (which we found empirically) where it reads
* two address register components per channel for indirectly
* addressed 64-bit sources.
*
* From the Cherryview PRM Vol 7. "Register Region Restrictions":
*
* "When source or destination datatype is 64b or operation is
* integer DWord multiply, indirect addressing must not be
* used."
*
* To work around both of these, we do two integer MOVs instead of
* one 64-bit MOV. Because no double value should ever cross a
* register boundary, it's safe to use the immediate offset in the
* indirect here to handle adding 4 bytes to the offset and avoid
* the extra ADD to the register file.
*/
struct brw_reg gdst = suboffset(dst, group);
/* View the 64-bit destination as D-typed elements with a stride of
* two, so each MOV below fills one 32-bit half of every channel.
*/
struct brw_reg dst_d = retype(spread(gdst, 2),
BRW_REGISTER_TYPE_D);
/* First half: indirect read at immediate offset 0. */
brw_MOV(p, dst_d,
retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
/* Second half: destination and indirect offset both advanced by 4
* bytes.
*/
brw_MOV(p, byte_offset(dst_d, 4),
retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
} else {
/* Single indirect MOV using the source's own type. */
brw_MOV(p, suboffset(dst, group),
retype(brw_VxH_indirect(0, 0), src.type));
}
}
}
}
void
fs_generator::generate_urb_read(fs_inst *inst,
struct brw_reg dst,
@@ -2189,6 +2289,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
brw_broadcast(p, dst, src[0], src[1]);
break;
case SHADER_OPCODE_SHUFFLE:
generate_shuffle(inst, dst, src[0], src[1]);
break;
case FS_OPCODE_SET_SAMPLE_ID:
generate_set_sample_id(inst, dst, src[0], src[1]);
break;

View File

@@ -4507,6 +4507,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
break;
}
case nir_intrinsic_shuffle: {
const fs_reg value = get_nir_src(instr->src[0]);
const fs_reg index = get_nir_src(instr->src[1]);
bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
break;
}
case nir_intrinsic_first_invocation: {
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);

View File

@@ -656,6 +656,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)
.lower_to_scalar = true,
.lower_subgroup_masks = true,
.lower_vote_trivial = !is_scalar,
.lower_shuffle = true,
};
OPT(nir_lower_subgroups, &subgroups_options);

View File

@@ -330,6 +330,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
return "find_live_channel";
case SHADER_OPCODE_BROADCAST:
return "broadcast";
case SHADER_OPCODE_SHUFFLE:
return "shuffle";
case SHADER_OPCODE_GET_BUFFER_SIZE:
return "get_buffer_size";