intel/fs: Implement quad swizzles on ICL+.
Align16 is no longer a thing, so a new implementation is provided using
Align1 instead. Not all possible swizzles can be represented as a single
Align1 region, but some fast paths are provided for frequently used
swizzles that can be represented efficiently in Align1 mode.

Fixes ~90 subgroup quad swap Vulkan CTS tests.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
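For reference, here is a minimal CPU-side sketch of the per-quad semantics that
the fast paths implement, assuming the BRW_SWIZZLE_* immediates pack four 2-bit
channel selectors (x in bits 0-1 through w in bits 6-7). The SWZ/GET_SWZ
helpers and the test program are illustrative only, not Mesa API:

    /* Illustrative model of SHADER_OPCODE_QUAD_SWIZZLE: channel i of each
     * aligned group of four invocations reads the channel selected by the
     * i-th 2-bit field of the swizzle immediate. */
    #include <stdio.h>

    #define SWZ(x, y, z, w) ((x) | ((y) << 2) | ((z) << 4) | ((w) << 6))
    #define GET_SWZ(swz, c)  (((swz) >> ((c) * 2)) & 0x3)

    static void
    quad_swizzle(float *dst, const float *src, unsigned n, unsigned swiz)
    {
       for (unsigned i = 0; i < n; i++)
          dst[i] = src[(i & ~3u) + GET_SWZ(swiz, i & 3)];
    }

    int
    main(void)
    {
       const float src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
       float dst[8];

       quad_swizzle(dst, src, 8, SWZ(0, 0, 0, 0)); /* XXXX: 0 0 0 0 4 4 4 4 */
       for (unsigned i = 0; i < 8; i++)
          printf("%g ", dst[i]);
       printf("\n");

       quad_swizzle(dst, src, 8, SWZ(2, 3, 2, 3)); /* ZWZW: 2 3 2 3 6 7 6 7 */
       for (unsigned i = 0; i < 8; i++)
          printf("%g ", dst[i]);
       printf("\n");
       return 0;
    }

Under this model the broadcast swizzles (XXXX/YYYY/ZZZZ/WWWW) read a single
channel per quad, which is what the <4;4,0> region emitted by
generate_quad_swizzle() below expresses as one Align1 MOV; XXZZ/YYWW and
XYXY/ZWZW similarly map to the <2;2,0> and <0;2,1> regions used in the patch.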
@@ -315,6 +315,24 @@ fs_inst::has_source_and_destination_hazard() const
        * may stomp all over it.
        */
       return true;
+   case SHADER_OPCODE_QUAD_SWIZZLE:
+      switch (src[1].ud) {
+      case BRW_SWIZZLE_XXXX:
+      case BRW_SWIZZLE_YYYY:
+      case BRW_SWIZZLE_ZZZZ:
+      case BRW_SWIZZLE_WWWW:
+      case BRW_SWIZZLE_XXZZ:
+      case BRW_SWIZZLE_YYWW:
+      case BRW_SWIZZLE_XYXY:
+      case BRW_SWIZZLE_ZWZW:
+         /* These can be implemented as a single Align1 region on all
+          * platforms, so there's never a hazard between source and
+          * destination. C.f. fs_generator::generate_quad_swizzle().
+          */
+         return false;
+      default:
+         return !is_uniform(src[0]);
+      }
    default:
       /* The SIMD16 compressed instruction
        *
@@ -5579,9 +5597,14 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
       return MIN2(8, inst->exec_size);
 
-   case SHADER_OPCODE_QUAD_SWIZZLE:
-      return 8;
+   case SHADER_OPCODE_QUAD_SWIZZLE: {
+      const unsigned swiz = inst->src[1].ud;
+      return (is_uniform(inst->src[0]) ?
+                 get_fpu_lowered_simd_width(devinfo, inst) :
+              devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
+              swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
+              get_fpu_lowered_simd_width(devinfo, inst));
+   }
 
    case SHADER_OPCODE_MOV_INDIRECT: {
       /* From IVB and HSW PRMs:
        *
@@ -480,6 +480,10 @@ private:
                          struct brw_reg src,
                          struct brw_reg idx);
 
+   void generate_quad_swizzle(const fs_inst *inst,
+                              struct brw_reg dst, struct brw_reg src,
+                              unsigned swiz);
+
    bool patch_discard_jumps_to_fb_writes();
 
    const struct brw_compiler *compiler;
@@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst *inst,
    }
 }
 
+void
+fs_generator::generate_quad_swizzle(const fs_inst *inst,
+                                    struct brw_reg dst, struct brw_reg src,
+                                    unsigned swiz)
+{
+   /* Requires a quad. */
+   assert(inst->exec_size >= 4);
+
+   if (src.file == BRW_IMMEDIATE_VALUE ||
+       has_scalar_region(src)) {
+      /* The value is uniform across all channels */
+      brw_MOV(p, dst, src);
+
+   } else if (devinfo->gen < 11 && type_sz(src.type) == 4) {
+      /* This only works on 8-wide 32-bit values */
+      assert(inst->exec_size == 8);
+      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
+      assert(src.vstride == src.width + 1);
+      brw_set_default_access_mode(p, BRW_ALIGN_16);
+      struct brw_reg swiz_src = stride(src, 4, 4, 1);
+      swiz_src.swizzle = swiz;
+      brw_MOV(p, dst, swiz_src);
+
+   } else {
+      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
+      assert(src.vstride == src.width + 1);
+      const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));
+
+      switch (swiz) {
+      case BRW_SWIZZLE_XXXX:
+      case BRW_SWIZZLE_YYYY:
+      case BRW_SWIZZLE_ZZZZ:
+      case BRW_SWIZZLE_WWWW:
+         brw_MOV(p, dst, stride(src_0, 4, 4, 0));
+         break;
+
+      case BRW_SWIZZLE_XXZZ:
+      case BRW_SWIZZLE_YYWW:
+         brw_MOV(p, dst, stride(src_0, 2, 2, 0));
+         break;
+
+      case BRW_SWIZZLE_XYXY:
+      case BRW_SWIZZLE_ZWZW:
+         assert(inst->exec_size == 4);
+         brw_MOV(p, dst, stride(src_0, 0, 2, 1));
+         break;
+
+      default:
+         assert(inst->force_writemask_all);
+         brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
+
+         for (unsigned c = 0; c < 4; c++) {
+            brw_inst *insn = brw_MOV(
+               p, stride(suboffset(dst, c),
+                         4 * inst->dst.stride, 1, 4 * inst->dst.stride),
+               stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));
+
+            brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
+            brw_inst_set_no_dd_check(devinfo, insn, c > 0);
+         }
+
+         break;
+      }
+   }
+}
+
 void
 fs_generator::generate_urb_read(fs_inst *inst,
                                 struct brw_reg dst,
@@ -2303,23 +2369,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          break;
 
       case SHADER_OPCODE_QUAD_SWIZZLE:
-         /* This only works on 8-wide 32-bit values */
-         assert(inst->exec_size == 8);
-         assert(type_sz(src[0].type) == 4);
-         assert(inst->force_writemask_all);
          assert(src[1].file == BRW_IMMEDIATE_VALUE);
          assert(src[1].type == BRW_REGISTER_TYPE_UD);
-
-         if (src[0].file == BRW_IMMEDIATE_VALUE ||
-             (src[0].vstride == 0 && src[0].hstride == 0)) {
-            /* The value is uniform across all channels */
-            brw_MOV(p, dst, src[0]);
-         } else {
-            brw_set_default_access_mode(p, BRW_ALIGN_16);
-            struct brw_reg swiz_src = stride(src[0], 4, 4, 1);
-            swiz_src.swizzle = inst->src[1].ud;
-            brw_MOV(p, dst, swiz_src);
-         }
+         generate_quad_swizzle(inst, dst, src[0], src[1].ud);
          break;
 
       case SHADER_OPCODE_CLUSTER_BROADCAST: {