intel/fs: Add support for subgroup quad operations
NIR has code to lower these away for us, but we can do significantly better in many cases with register regioning and SIMD4x2.

Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
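For reference, the quad operations act on clusters of four subgroup channels (a 2x2 fragment quad). A minimal scalar sketch of their semantics, assuming a plain array-of-channels model whose length is a multiple of 4 (quad_swap and quad_broadcast are illustrative names, not part of this patch):

#include <cstdint>
#include <vector>

/* quad_swap: channel i reads channel (i ^ mask) within its own quad.
 * mask 1 = horizontal, 2 = vertical, 3 = diagonal.
 */
static std::vector<uint32_t>
quad_swap(const std::vector<uint32_t> &src, unsigned mask)
{
   std::vector<uint32_t> dst(src.size());
   for (unsigned i = 0; i < src.size(); i++)
      dst[i] = src[i ^ mask];
   return dst;
}

/* quad_broadcast: every channel in a quad reads the channel at
 * `index` (0..3) within that same quad.
 */
static std::vector<uint32_t>
quad_broadcast(const std::vector<uint32_t> &src, unsigned index)
{
   std::vector<uint32_t> dst(src.size());
   for (unsigned i = 0; i < src.size(); i++)
      dst[i] = src[(i & ~3u) + index];
   return dst;
}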
@@ -467,6 +467,11 @@ enum opcode {
    */
   SHADER_OPCODE_SEL_EXEC,
 
+  /* This turns into an align16 mov from src0 to dst with a swizzle
+   * provided as an immediate in src1.
+   */
+  SHADER_OPCODE_QUAD_SWIZZLE,
+
   /* Take every Nth element in src0 and broadcast it to the group of N
    * channels in which it lives in the destination. The offset within the
    * cluster is given by src1 and the cluster size is given by src2.
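The swizzle immediate packs one 2-bit source selector per channel, BRW_SWIZZLE4-style (the macro below is assumed to match its definition in brw_reg.h). A scalar model of what the resulting align16 MOV reads per channel:

#include <cstdint>

/* Assumed to match BRW_SWIZZLE4 in brw_reg.h: two bits per channel. */
#define BRW_SWIZZLE4(a, b, c, d) \
   (((a) << 0) | ((b) << 2) | ((c) << 4) | ((d) << 6))

/* Apply a 4-channel swizzle immediate to every group of four
 * channels, the way the align16 MOV does across a SIMD8 register.
 */
static void
apply_quad_swizzle(uint32_t *dst, const uint32_t *src,
                   unsigned channels, uint32_t swiz)
{
   for (unsigned i = 0; i < channels; i++) {
      const unsigned sel = (swiz >> ((i % 4) * 2)) & 3;
      dst[i] = src[(i & ~3u) + sel];
   }
}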
@@ -5233,6 +5233,9 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
       return MIN2(8, inst->exec_size);
 
+   case SHADER_OPCODE_QUAD_SWIZZLE:
+      return 8;
+
    case SHADER_OPCODE_MOV_INDIRECT: {
       /* From IVB and HSW PRMs:
        *
@@ -2301,6 +2301,26 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
       brw_MOV(p, dst, src[0]);
       break;
 
+   case SHADER_OPCODE_QUAD_SWIZZLE:
+      /* This only works on 8-wide 32-bit values */
+      assert(inst->exec_size == 8);
+      assert(type_sz(src[0].type) == 4);
+      assert(inst->force_writemask_all);
+      assert(src[1].file == BRW_IMMEDIATE_VALUE);
+      assert(src[1].type == BRW_REGISTER_TYPE_UD);
+
+      if (src[0].file == BRW_IMMEDIATE_VALUE ||
+          (src[0].vstride == 0 && src[0].hstride == 0)) {
+         /* The value is uniform across all channels */
+         brw_MOV(p, dst, src[0]);
+      } else {
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         struct brw_reg swiz_src = stride(src[0], 4, 4, 1);
+         swiz_src.swizzle = inst->src[1].ud;
+         brw_MOV(p, dst, swiz_src);
+      }
+      break;
+
    case SHADER_OPCODE_CLUSTER_BROADCAST: {
       assert(src[0].type == dst.type);
       assert(!src[0].negate && !src[0].abs);
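The generator's fast path above relies on a regioning fact: a source with zero vertical and zero horizontal stride replicates one element to every channel, so swizzling it is a no-op. A condensed restatement with a stand-in struct (hypothetical, for illustration only):

/* Minimal stand-in for the brw_reg fields the check uses. */
struct reg {
   bool is_immediate;
   unsigned vstride, hstride;
};

/* Immediates and zero-stride regions are uniform across channels,
 * so the quad swizzle can degenerate to a plain MOV.
 */
static bool
src_is_uniform(const struct reg &src)
{
   return src.is_immediate || (src.vstride == 0 && src.hstride == 0);
}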
@@ -4588,6 +4588,100 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_quad_broadcast: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      nir_const_value *index = nir_src_as_const_value(instr->src[1]);
+      assert(nir_src_bit_size(instr->src[1]) == 32);
+
+      bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
+               value, brw_imm_ud(index->u32[0]), brw_imm_ud(4));
+      break;
+   }
+
+   case nir_intrinsic_quad_swap_horizontal: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      const fs_reg tmp = bld.vgrf(value.type);
+      const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
+
+      const fs_reg src_left = horiz_stride(value, 2);
+      const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
+      const fs_reg tmp_left = horiz_stride(tmp, 2);
+      const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
+
+      /* From the Cherryview PRM Vol. 7, "Register Region Restrictions":
+       *
+       *    "When source or destination datatype is 64b or operation is
+       *    integer DWord multiply, regioning in Align1 must follow
+       *    these rules:
+       *
+       *    [...]
+       *
+       *    3. Source and Destination offset must be the same, except
+       *       the case of scalar source."
+       *
+       * In order to work around this, we have to emit two 32-bit MOVs instead
+       * of a single 64-bit MOV to do the shuffle.
+       */
+      if (type_sz(value.type) > 4 &&
+          (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+         ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 0),
+                  subscript(src_right, BRW_REGISTER_TYPE_D, 0));
+         ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 1),
+                  subscript(src_right, BRW_REGISTER_TYPE_D, 1));
+         ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 0),
+                  subscript(src_left, BRW_REGISTER_TYPE_D, 0));
+         ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 1),
+                  subscript(src_left, BRW_REGISTER_TYPE_D, 1));
+      } else {
+         ubld.MOV(tmp_left, src_right);
+         ubld.MOV(tmp_right, src_left);
+      }
+
+      bld.MOV(retype(dest, value.type), tmp);
+      break;
+   }
+
+   case nir_intrinsic_quad_swap_vertical: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      if (nir_src_bit_size(instr->src[0]) == 32) {
+         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
+         const fs_reg tmp = bld.vgrf(value.type);
+         const fs_builder ubld = bld.exec_all();
+         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
+                   brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
+         bld.MOV(retype(dest, value.type), tmp);
+      } else {
+         /* For larger data types, we have to either emit dispatch_width many
+          * MOVs or else fall back to doing indirects.
+          */
+         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+                 brw_imm_w(0x2));
+         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
+      }
+      break;
+   }
+
+   case nir_intrinsic_quad_swap_diagonal: {
+      const fs_reg value = get_nir_src(instr->src[0]);
+      if (nir_src_bit_size(instr->src[0]) == 32) {
+         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
+         const fs_reg tmp = bld.vgrf(value.type);
+         const fs_builder ubld = bld.exec_all();
+         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
+                   brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
+         bld.MOV(retype(dest, value.type), tmp);
+      } else {
+         /* For larger data types, we have to either emit dispatch_width many
+          * MOVs or else fall back to doing indirects.
+          */
+         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+                 brw_imm_w(0x3));
+         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
+      }
+      break;
+   }
+
    case nir_intrinsic_reduce: {
       fs_reg src = get_nir_src(instr->src[0]);
       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
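The 32-bit SIMD4x2 paths and the 64-bit SHUFFLE fallback above compute the same permutation: BRW_SWIZZLE4(2,3,0,1) selects channel XOR 2 and BRW_SWIZZLE4(3,2,1,0) selects channel XOR 3. A quick self-contained check (the macro is assumed to match brw_reg.h):

#include <cassert>

#define BRW_SWIZZLE4(a, b, c, d) \
   (((a) << 0) | ((b) << 2) | ((c) << 4) | ((d) << 6))

int main()
{
   const unsigned vertical = BRW_SWIZZLE4(2, 3, 0, 1);
   const unsigned diagonal = BRW_SWIZZLE4(3, 2, 1, 0);
   for (unsigned chan = 0; chan < 4; chan++) {
      /* The 2-bit selector for each channel matches the XOR index. */
      assert(((vertical >> (chan * 2)) & 3) == (chan ^ 2u));
      assert(((diagonal >> (chan * 2)) & 3) == (chan ^ 3u));
   }
   return 0;
}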
@@ -334,6 +334,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
       return "shuffle";
    case SHADER_OPCODE_SEL_EXEC:
       return "sel_exec";
+   case SHADER_OPCODE_QUAD_SWIZZLE:
+      return "quad_swizzle";
    case SHADER_OPCODE_CLUSTER_BROADCAST:
       return "cluster_broadcast";