i965/fs: Add support for nir_intrinsic_shuffle
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
This commit is contained in:
@@ -451,6 +451,15 @@ enum opcode {
|
|||||||
*/
|
*/
|
||||||
SHADER_OPCODE_BROADCAST,
|
SHADER_OPCODE_BROADCAST,
|
||||||
|
|
||||||
|
/* Pick the channel from its first source register given by the index
|
||||||
|
* specified as second source.
|
||||||
|
*
|
||||||
|
* This is similar to the BROADCAST instruction except that it takes a
|
||||||
|
* dynamic index and potentially puts a different value in each output
|
||||||
|
* channel.
|
||||||
|
*/
|
||||||
|
SHADER_OPCODE_SHUFFLE,
|
||||||
|
|
||||||
SHADER_OPCODE_GET_BUFFER_SIZE,
|
SHADER_OPCODE_GET_BUFFER_SIZE,
|
||||||
|
|
||||||
VEC4_OPCODE_MOV_BYTES,
|
VEC4_OPCODE_MOV_BYTES,
|
||||||
|
@@ -310,6 +310,13 @@ fs_inst::has_source_and_destination_hazard() const
|
|||||||
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
|
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
|
||||||
/* Multiple partial writes to the destination */
|
/* Multiple partial writes to the destination */
|
||||||
return true;
|
return true;
|
||||||
|
case SHADER_OPCODE_SHUFFLE:
|
||||||
|
/* This instruction returns an arbitrary channel from the source and
|
||||||
|
* gets split into smaller instructions in the generator. It's possible
|
||||||
|
* that one of the instructions will read from a channel corresponding
|
||||||
|
* to an earlier instruction.
|
||||||
|
*/
|
||||||
|
return true;
|
||||||
default:
|
default:
|
||||||
/* The SIMD16 compressed instruction
|
/* The SIMD16 compressed instruction
|
||||||
*
|
*
|
||||||
@@ -2531,6 +2538,20 @@ fs_visitor::opt_algebraic()
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case SHADER_OPCODE_SHUFFLE:
|
||||||
|
if (is_uniform(inst->src[0])) {
|
||||||
|
inst->opcode = BRW_OPCODE_MOV;
|
||||||
|
inst->sources = 1;
|
||||||
|
progress = true;
|
||||||
|
} else if (inst->src[1].file == IMM) {
|
||||||
|
inst->opcode = BRW_OPCODE_MOV;
|
||||||
|
inst->src[0] = component(inst->src[0],
|
||||||
|
inst->src[1].ud);
|
||||||
|
inst->sources = 1;
|
||||||
|
progress = true;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@@ -471,6 +471,11 @@ private:
|
|||||||
struct brw_reg reg,
|
struct brw_reg reg,
|
||||||
struct brw_reg indirect_byte_offset);
|
struct brw_reg indirect_byte_offset);
|
||||||
|
|
||||||
|
void generate_shuffle(fs_inst *inst,
|
||||||
|
struct brw_reg dst,
|
||||||
|
struct brw_reg src,
|
||||||
|
struct brw_reg idx);
|
||||||
|
|
||||||
bool patch_discard_jumps_to_fb_writes();
|
bool patch_discard_jumps_to_fb_writes();
|
||||||
|
|
||||||
const struct brw_compiler *compiler;
|
const struct brw_compiler *compiler;
|
||||||
|
@@ -540,6 +540,106 @@ fs_generator::generate_mov_indirect(fs_inst *inst,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
fs_generator::generate_shuffle(fs_inst *inst,
|
||||||
|
struct brw_reg dst,
|
||||||
|
struct brw_reg src,
|
||||||
|
struct brw_reg idx)
|
||||||
|
{
|
||||||
|
/* Ivy bridge has some strange behavior that makes this a real pain to
|
||||||
|
* implement for 64-bit values so we just don't bother.
|
||||||
|
*/
|
||||||
|
assert(devinfo->gen >= 8 || devinfo->is_haswell || type_sz(src.type) <= 4);
|
||||||
|
|
||||||
|
/* Because we're using the address register, we're limited to 8-wide
|
||||||
|
* execution on gen7. On gen8, we're limited to 16-wide by the address
|
||||||
|
* register file and 8-wide for 64-bit types. We could try and make this
|
||||||
|
* instruction splittable higher up in the compiler but that gets weird
|
||||||
|
* because it reads all of the channels regardless of execution size. It's
|
||||||
|
* easier just to split it here.
|
||||||
|
*/
|
||||||
|
const unsigned lower_width =
|
||||||
|
(devinfo->gen <= 7 || type_sz(src.type) > 4) ?
|
||||||
|
8 : MIN2(16, inst->exec_size);
|
||||||
|
|
||||||
|
brw_set_default_exec_size(p, cvt(lower_width) - 1);
|
||||||
|
for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
|
||||||
|
brw_set_default_group(p, group);
|
||||||
|
|
||||||
|
if ((src.vstride == 0 && src.hstride == 0) ||
|
||||||
|
idx.file == BRW_IMMEDIATE_VALUE) {
|
||||||
|
/* Trivial, the source is already uniform or the index is a constant.
|
||||||
|
* We will typically not get here if the optimizer is doing its job,
|
||||||
|
* but asserting would be mean.
|
||||||
|
*/
|
||||||
|
const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
|
||||||
|
brw_MOV(p, suboffset(dst, group), stride(suboffset(src, i), 0, 1, 0));
|
||||||
|
} else {
|
||||||
|
/* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
|
||||||
|
struct brw_reg addr = vec8(brw_address_reg(0));
|
||||||
|
|
||||||
|
struct brw_reg group_idx = suboffset(idx, group);
|
||||||
|
|
||||||
|
if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
|
||||||
|
/* Things get grumpy if the register is too wide. */
|
||||||
|
group_idx.width--;
|
||||||
|
group_idx.vstride--;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(type_sz(group_idx.type) <= 4);
|
||||||
|
if (type_sz(group_idx.type) == 4) {
|
||||||
|
/* The destination stride of an instruction (in bytes) must be
|
||||||
|
* greater than or equal to the size of the rest of the
|
||||||
|
* instruction. Since the address register is of type UW, we
|
||||||
|
* can't use a D-type instruction. In order to get around this,
|
||||||
|
* we retype to UW and use a stride.
|
||||||
|
*/
|
||||||
|
group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Take into account the component size and horizontal stride. */
|
||||||
|
assert(src.vstride == src.hstride + src.width);
|
||||||
|
brw_SHL(p, addr, group_idx,
|
||||||
|
brw_imm_uw(_mesa_logbase2(type_sz(src.type)) +
|
||||||
|
src.hstride - 1));
|
||||||
|
|
||||||
|
/* Add on the register start offset */
|
||||||
|
brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr));
|
||||||
|
|
||||||
|
if (type_sz(src.type) > 4 &&
|
||||||
|
((devinfo->gen == 7 && !devinfo->is_haswell) ||
|
||||||
|
devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
|
||||||
|
/* IVB has an issue (which we found empirically) where it reads
|
||||||
|
* two address register components per channel for indirectly
|
||||||
|
* addressed 64-bit sources.
|
||||||
|
*
|
||||||
|
* From the Cherryview PRM Vol 7. "Register Region Restrictions":
|
||||||
|
*
|
||||||
|
* "When source or destination datatype is 64b or operation is
|
||||||
|
* integer DWord multiply, indirect addressing must not be
|
||||||
|
* used."
|
||||||
|
*
|
||||||
|
* To work around both of these, we do two integer MOVs instead of
|
||||||
|
* one 64-bit MOV. Because no double value should ever cross a
|
||||||
|
* register boundary, it's safe to use the immediate offset in the
|
||||||
|
* indirect here to handle adding 4 bytes to the offset and avoid
|
||||||
|
* the extra ADD to the register file.
|
||||||
|
*/
|
||||||
|
struct brw_reg gdst = suboffset(dst, group);
|
||||||
|
struct brw_reg dst_d = retype(spread(gdst, 2),
|
||||||
|
BRW_REGISTER_TYPE_D);
|
||||||
|
brw_MOV(p, dst_d,
|
||||||
|
retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
|
||||||
|
brw_MOV(p, byte_offset(dst_d, 4),
|
||||||
|
retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
|
||||||
|
} else {
|
||||||
|
brw_MOV(p, suboffset(dst, group),
|
||||||
|
retype(brw_VxH_indirect(0, 0), src.type));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
fs_generator::generate_urb_read(fs_inst *inst,
|
fs_generator::generate_urb_read(fs_inst *inst,
|
||||||
struct brw_reg dst,
|
struct brw_reg dst,
|
||||||
@@ -2189,6 +2289,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
|
|||||||
brw_broadcast(p, dst, src[0], src[1]);
|
brw_broadcast(p, dst, src[0], src[1]);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case SHADER_OPCODE_SHUFFLE:
|
||||||
|
generate_shuffle(inst, dst, src[0], src[1]);
|
||||||
|
break;
|
||||||
|
|
||||||
case FS_OPCODE_SET_SAMPLE_ID:
|
case FS_OPCODE_SET_SAMPLE_ID:
|
||||||
generate_set_sample_id(inst, dst, src[0], src[1]);
|
generate_set_sample_id(inst, dst, src[0], src[1]);
|
||||||
break;
|
break;
|
||||||
|
@@ -4507,6 +4507,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case nir_intrinsic_shuffle: {
|
||||||
|
const fs_reg value = get_nir_src(instr->src[0]);
|
||||||
|
const fs_reg index = get_nir_src(instr->src[1]);
|
||||||
|
|
||||||
|
bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case nir_intrinsic_first_invocation: {
|
case nir_intrinsic_first_invocation: {
|
||||||
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||||
bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
|
bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
|
||||||
|
@@ -656,6 +656,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)
|
|||||||
.lower_to_scalar = true,
|
.lower_to_scalar = true,
|
||||||
.lower_subgroup_masks = true,
|
.lower_subgroup_masks = true,
|
||||||
.lower_vote_trivial = !is_scalar,
|
.lower_vote_trivial = !is_scalar,
|
||||||
|
.lower_shuffle = true,
|
||||||
};
|
};
|
||||||
OPT(nir_lower_subgroups, &subgroups_options);
|
OPT(nir_lower_subgroups, &subgroups_options);
|
||||||
|
|
||||||
|
@@ -330,6 +330,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
|
|||||||
return "find_live_channel";
|
return "find_live_channel";
|
||||||
case SHADER_OPCODE_BROADCAST:
|
case SHADER_OPCODE_BROADCAST:
|
||||||
return "broadcast";
|
return "broadcast";
|
||||||
|
case SHADER_OPCODE_SHUFFLE:
|
||||||
|
return "shuffle";
|
||||||
|
|
||||||
case SHADER_OPCODE_GET_BUFFER_SIZE:
|
case SHADER_OPCODE_GET_BUFFER_SIZE:
|
||||||
return "get_buffer_size";
|
return "get_buffer_size";
|
||||||
|
Reference in New Issue
Block a user