intel/fs: Implement quad swizzles on ICL+.
The Align16 access mode is no longer available on ICL+ (Gen11+), so a new implementation is provided using Align1 instead. Not all possible swizzles can be represented as a single Align1 region, but fast paths are provided for frequently used swizzles that can be represented efficiently in Align1 mode; any other swizzle falls back to one MOV per quad component.

Fixes ~90 subgroup quad swap Vulkan CTS tests.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
This commit is contained in:
@@ -315,6 +315,24 @@ fs_inst::has_source_and_destination_hazard() const
|
|||||||
* may stomp all over it.
|
* may stomp all over it.
|
||||||
*/
|
*/
|
||||||
return true;
|
return true;
|
||||||
|
case SHADER_OPCODE_QUAD_SWIZZLE:
|
||||||
|
switch (src[1].ud) {
|
||||||
|
case BRW_SWIZZLE_XXXX:
|
||||||
|
case BRW_SWIZZLE_YYYY:
|
||||||
|
case BRW_SWIZZLE_ZZZZ:
|
||||||
|
case BRW_SWIZZLE_WWWW:
|
||||||
|
case BRW_SWIZZLE_XXZZ:
|
||||||
|
case BRW_SWIZZLE_YYWW:
|
||||||
|
case BRW_SWIZZLE_XYXY:
|
||||||
|
case BRW_SWIZZLE_ZWZW:
|
||||||
|
/* These can be implemented as a single Align1 region on all
|
||||||
|
* platforms, so there's never a hazard between source and
|
||||||
|
* destination. C.f. fs_generator::generate_quad_swizzle().
|
||||||
|
*/
|
||||||
|
return false;
|
||||||
|
default:
|
||||||
|
return !is_uniform(src[0]);
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
/* The SIMD16 compressed instruction
|
/* The SIMD16 compressed instruction
|
||||||
*
|
*
|
||||||
@@ -5579,9 +5597,14 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
|
|||||||
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
|
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
|
||||||
return MIN2(8, inst->exec_size);
|
return MIN2(8, inst->exec_size);
|
||||||
|
|
||||||
case SHADER_OPCODE_QUAD_SWIZZLE:
|
case SHADER_OPCODE_QUAD_SWIZZLE: {
|
||||||
return 8;
|
const unsigned swiz = inst->src[1].ud;
|
||||||
|
return (is_uniform(inst->src[0]) ?
|
||||||
|
get_fpu_lowered_simd_width(devinfo, inst) :
|
||||||
|
devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
|
||||||
|
swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
|
||||||
|
get_fpu_lowered_simd_width(devinfo, inst));
|
||||||
|
}
|
||||||
case SHADER_OPCODE_MOV_INDIRECT: {
|
case SHADER_OPCODE_MOV_INDIRECT: {
|
||||||
/* From IVB and HSW PRMs:
|
/* From IVB and HSW PRMs:
|
||||||
*
|
*
|
||||||
|
@@ -480,6 +480,10 @@ private:
|
|||||||
struct brw_reg src,
|
struct brw_reg src,
|
||||||
struct brw_reg idx);
|
struct brw_reg idx);
|
||||||
|
|
||||||
|
void generate_quad_swizzle(const fs_inst *inst,
|
||||||
|
struct brw_reg dst, struct brw_reg src,
|
||||||
|
unsigned swiz);
|
||||||
|
|
||||||
bool patch_discard_jumps_to_fb_writes();
|
bool patch_discard_jumps_to_fb_writes();
|
||||||
|
|
||||||
const struct brw_compiler *compiler;
|
const struct brw_compiler *compiler;
|
||||||
|
@@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst *inst,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
fs_generator::generate_quad_swizzle(const fs_inst *inst,
|
||||||
|
struct brw_reg dst, struct brw_reg src,
|
||||||
|
unsigned swiz)
|
||||||
|
{
|
||||||
|
/* Requires a quad. */
|
||||||
|
assert(inst->exec_size >= 4);
|
||||||
|
|
||||||
|
if (src.file == BRW_IMMEDIATE_VALUE ||
|
||||||
|
has_scalar_region(src)) {
|
||||||
|
/* The value is uniform across all channels */
|
||||||
|
brw_MOV(p, dst, src);
|
||||||
|
|
||||||
|
} else if (devinfo->gen < 11 && type_sz(src.type) == 4) {
|
||||||
|
/* This only works on 8-wide 32-bit values */
|
||||||
|
assert(inst->exec_size == 8);
|
||||||
|
assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
|
||||||
|
assert(src.vstride == src.width + 1);
|
||||||
|
brw_set_default_access_mode(p, BRW_ALIGN_16);
|
||||||
|
struct brw_reg swiz_src = stride(src, 4, 4, 1);
|
||||||
|
swiz_src.swizzle = swiz;
|
||||||
|
brw_MOV(p, dst, swiz_src);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
|
||||||
|
assert(src.vstride == src.width + 1);
|
||||||
|
const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));
|
||||||
|
|
||||||
|
switch (swiz) {
|
||||||
|
case BRW_SWIZZLE_XXXX:
|
||||||
|
case BRW_SWIZZLE_YYYY:
|
||||||
|
case BRW_SWIZZLE_ZZZZ:
|
||||||
|
case BRW_SWIZZLE_WWWW:
|
||||||
|
brw_MOV(p, dst, stride(src_0, 4, 4, 0));
|
||||||
|
break;
|
||||||
|
|
||||||
|
case BRW_SWIZZLE_XXZZ:
|
||||||
|
case BRW_SWIZZLE_YYWW:
|
||||||
|
brw_MOV(p, dst, stride(src_0, 2, 2, 0));
|
||||||
|
break;
|
||||||
|
|
||||||
|
case BRW_SWIZZLE_XYXY:
|
||||||
|
case BRW_SWIZZLE_ZWZW:
|
||||||
|
assert(inst->exec_size == 4);
|
||||||
|
brw_MOV(p, dst, stride(src_0, 0, 2, 1));
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
assert(inst->force_writemask_all);
|
||||||
|
brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
|
||||||
|
|
||||||
|
for (unsigned c = 0; c < 4; c++) {
|
||||||
|
brw_inst *insn = brw_MOV(
|
||||||
|
p, stride(suboffset(dst, c),
|
||||||
|
4 * inst->dst.stride, 1, 4 * inst->dst.stride),
|
||||||
|
stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));
|
||||||
|
|
||||||
|
brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
|
||||||
|
brw_inst_set_no_dd_check(devinfo, insn, c > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
fs_generator::generate_urb_read(fs_inst *inst,
|
fs_generator::generate_urb_read(fs_inst *inst,
|
||||||
struct brw_reg dst,
|
struct brw_reg dst,
|
||||||
@@ -2303,23 +2369,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case SHADER_OPCODE_QUAD_SWIZZLE:
|
case SHADER_OPCODE_QUAD_SWIZZLE:
|
||||||
/* This only works on 8-wide 32-bit values */
|
|
||||||
assert(inst->exec_size == 8);
|
|
||||||
assert(type_sz(src[0].type) == 4);
|
|
||||||
assert(inst->force_writemask_all);
|
|
||||||
assert(src[1].file == BRW_IMMEDIATE_VALUE);
|
assert(src[1].file == BRW_IMMEDIATE_VALUE);
|
||||||
assert(src[1].type == BRW_REGISTER_TYPE_UD);
|
assert(src[1].type == BRW_REGISTER_TYPE_UD);
|
||||||
|
generate_quad_swizzle(inst, dst, src[0], src[1].ud);
|
||||||
if (src[0].file == BRW_IMMEDIATE_VALUE ||
|
|
||||||
(src[0].vstride == 0 && src[0].hstride == 0)) {
|
|
||||||
/* The value is uniform across all channels */
|
|
||||||
brw_MOV(p, dst, src[0]);
|
|
||||||
} else {
|
|
||||||
brw_set_default_access_mode(p, BRW_ALIGN_16);
|
|
||||||
struct brw_reg swiz_src = stride(src[0], 4, 4, 1);
|
|
||||||
swiz_src.swizzle = inst->src[1].ud;
|
|
||||||
brw_MOV(p, dst, swiz_src);
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case SHADER_OPCODE_CLUSTER_BROADCAST: {
|
case SHADER_OPCODE_CLUSTER_BROADCAST: {
|
||||||
|
Reference in New Issue
Block a user