intel/fs: Add a helper for emitting scan operations
This commit adds a helper to the builder for emitting "scan" operations. Given a binary operation #, a scan takes the vector [a0, a1, ..., aN] and returns the vector [a0, a0 # a1, ..., a0 # a1 # ... # aN] where each channel contains the combination of all previous channels. The sequence of instructions to perform the scan is fairly optimal; a 16-wide scan on a 32-bit type is only 6 instructions. The subgroup scan and reduction operations will be implemented in terms of this. Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
This commit is contained in:
@@ -412,6 +412,115 @@ namespace brw {
|
||||
return src_reg(component(dst, 0));
|
||||
}
|
||||
|
||||
void
|
||||
emit_scan(enum opcode opcode, const dst_reg &tmp,
|
||||
unsigned cluster_size, brw_conditional_mod mod) const
|
||||
{
|
||||
assert(dispatch_width() >= 8);
|
||||
|
||||
/* The instruction splitting code isn't advanced enough to split
|
||||
* these so we need to handle that ourselves.
|
||||
*/
|
||||
if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
|
||||
const unsigned half_width = dispatch_width() / 2;
|
||||
const fs_builder ubld = exec_all().group(half_width, 0);
|
||||
dst_reg left = tmp;
|
||||
dst_reg right = horiz_offset(tmp, half_width);
|
||||
ubld.emit_scan(opcode, left, cluster_size, mod);
|
||||
ubld.emit_scan(opcode, right, cluster_size, mod);
|
||||
if (cluster_size > half_width) {
|
||||
src_reg left_comp = component(left, half_width - 1);
|
||||
set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (cluster_size > 1) {
|
||||
const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
|
||||
dst_reg left = horiz_stride(tmp, 2);
|
||||
dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
|
||||
|
||||
/* From the Cherryview PRM Vol. 7, "Register Region Restrictiosn":
|
||||
*
|
||||
* "When source or destination datatype is 64b or operation is
|
||||
* integer DWord multiply, regioning in Align1 must follow
|
||||
* these rules:
|
||||
*
|
||||
* [...]
|
||||
*
|
||||
* 3. Source and Destination offset must be the same, except
|
||||
* the case of scalar source."
|
||||
*
|
||||
* In order to work around this, we create a temporary register
|
||||
* and shift left over to match right. If we have a 64-bit type,
|
||||
* we have to use two integer MOVs instead of a 64-bit MOV.
|
||||
*/
|
||||
if (need_matching_subreg_offset(opcode, tmp.type)) {
|
||||
dst_reg tmp2 = vgrf(tmp.type);
|
||||
dst_reg new_left = horiz_stride(horiz_offset(tmp2, 1), 2);
|
||||
if (type_sz(tmp.type) > 4) {
|
||||
ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 0),
|
||||
subscript(left, BRW_REGISTER_TYPE_D, 0));
|
||||
ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 1),
|
||||
subscript(left, BRW_REGISTER_TYPE_D, 1));
|
||||
} else {
|
||||
ubld.MOV(new_left, left);
|
||||
}
|
||||
left = new_left;
|
||||
}
|
||||
set_condmod(mod, ubld.emit(opcode, right, left, right));
|
||||
}
|
||||
|
||||
if (cluster_size > 2) {
|
||||
if (type_sz(tmp.type) <= 4 &&
|
||||
!need_matching_subreg_offset(opcode, tmp.type)) {
|
||||
const fs_builder ubld =
|
||||
exec_all().group(dispatch_width() / 4, 0);
|
||||
src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);
|
||||
|
||||
dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
|
||||
set_condmod(mod, ubld.emit(opcode, right, left, right));
|
||||
|
||||
right = horiz_stride(horiz_offset(tmp, 3), 4);
|
||||
set_condmod(mod, ubld.emit(opcode, right, left, right));
|
||||
} else {
|
||||
/* For 64-bit types, we have to do things differently because
|
||||
* the code above would land us with destination strides that
|
||||
* the hardware can't handle. Fortunately, we'll only be
|
||||
* 8-wide in that case and it's the same number of
|
||||
* instructions.
|
||||
*/
|
||||
const fs_builder ubld = exec_all().group(2, 0);
|
||||
|
||||
for (unsigned i = 0; i < dispatch_width(); i += 4) {
|
||||
src_reg left = component(tmp, i + 1);
|
||||
dst_reg right = horiz_offset(tmp, i + 2);
|
||||
set_condmod(mod, ubld.emit(opcode, right, left, right));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (cluster_size > 4) {
|
||||
const fs_builder ubld = exec_all().group(4, 0);
|
||||
src_reg left = component(tmp, 3);
|
||||
dst_reg right = horiz_offset(tmp, 4);
|
||||
set_condmod(mod, ubld.emit(opcode, right, left, right));
|
||||
|
||||
if (dispatch_width() > 8) {
|
||||
left = component(tmp, 8 + 3);
|
||||
right = horiz_offset(tmp, 8 + 4);
|
||||
set_condmod(mod, ubld.emit(opcode, right, left, right));
|
||||
}
|
||||
}
|
||||
|
||||
if (cluster_size > 8 && dispatch_width() > 8) {
|
||||
const fs_builder ubld = exec_all().group(8, 0);
|
||||
src_reg left = component(tmp, 7);
|
||||
dst_reg right = horiz_offset(tmp, 8);
|
||||
set_condmod(mod, ubld.emit(opcode, right, left, right));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Assorted arithmetic ops.
|
||||
* @{
|
||||
@@ -644,6 +753,38 @@ namespace brw {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* From the Cherryview PRM Vol. 7, "Register Region Restrictiosn":
|
||||
*
|
||||
* "When source or destination datatype is 64b or operation is
|
||||
* integer DWord multiply, regioning in Align1 must follow
|
||||
* these rules:
|
||||
*
|
||||
* [...]
|
||||
*
|
||||
* 3. Source and Destination offset must be the same, except
|
||||
* the case of scalar source."
|
||||
*
|
||||
* This helper just detects when we're in this case.
|
||||
*/
|
||||
bool
|
||||
need_matching_subreg_offset(enum opcode opcode,
|
||||
enum brw_reg_type type) const
|
||||
{
|
||||
if (!shader->devinfo->is_cherryview &&
|
||||
!gen_device_info_is_9lp(shader->devinfo))
|
||||
return false;
|
||||
|
||||
if (type_sz(type > 4))
|
||||
return true;
|
||||
|
||||
if (opcode == BRW_OPCODE_MUL &&
|
||||
!brw_reg_type_is_floating_point(type))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bblock_t *block;
|
||||
exec_node *cursor;
|
||||
|
||||
|
@@ -312,6 +312,13 @@ subscript(fs_reg reg, brw_reg_type type, unsigned i)
|
||||
return byte_offset(retype(reg, type), i * type_sz(type));
|
||||
}
|
||||
|
||||
static inline fs_reg
|
||||
horiz_stride(fs_reg reg, unsigned s)
|
||||
{
|
||||
reg.stride *= s;
|
||||
return reg;
|
||||
}
|
||||
|
||||
static const fs_reg reg_undef;
|
||||
|
||||
class fs_inst : public backend_instruction {
|
||||
|
Reference in New Issue
Block a user