intel/fs: Implement quad swizzles on ICL+.

Align16 is no longer a thing, so a new implementation is provided
using Align1 instead.  Not all possible swizzles can be represented as
a single Align1 region, but some fast paths are provided for
frequently used swizzles that can be represented efficiently in Align1
mode.

Fixes ~90 subgroup quad swap Vulkan CTS tests.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Author: Francisco Jerez
Date:   2018-12-06 14:11:34 -08:00
parent c5f9c0009d
commit 812ede088f

3 changed files with 97 additions and 18 deletions
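For readers without the hardware region syntax paged in: an Align1 region <vstride; width, hstride> starting at element sub makes execution channel i read source element sub + (i / width) * vstride + (i % width) * hstride. The fast paths in this patch choose (sub, vstride, width, hstride) so that this formula reproduces the desired quad swizzle. A minimal standalone sketch of that correspondence (illustration only, not Mesa code; region_read is a made-up helper):

#include <stdio.h>

/* Element read by channel i of a region <v;w,h> at suboffset sub. */
static unsigned region_read(unsigned sub, unsigned v, unsigned w,
                            unsigned h, unsigned i)
{
   return sub + (i / w) * v + (i % w) * h;
}

int main(void)
{
   /* BRW_SWIZZLE_XXZZ as the region <2;2,0> at suboffset 0: a SIMD8
    * instruction reads elements 0 0 2 2 4 4 6 6. */
   for (unsigned i = 0; i < 8; i++)
      printf("%u ", region_read(0, 2, 2, 0, i));
   printf("\n");

   /* BRW_SWIZZLE_YYYY as <4;4,0> at suboffset 1: reads 1 1 1 1 5 5 5 5,
    * i.e. the Y channel of each quad, replicated. */
   for (unsigned i = 0; i < 8; i++)
      printf("%u ", region_read(1, 4, 4, 0, i));
   printf("\n");
   return 0;
}

These are exactly the stride(src_0, 2, 2, 0) and stride(src_0, 4, 4, 0) regions emitted by generate_quad_swizzle() below, with suboffset(src, BRW_GET_SWZ(swiz, 0)) supplying sub.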

src/intel/compiler/brw_fs.cpp

@@ -315,6 +315,24 @@ fs_inst::has_source_and_destination_hazard() const
        * may stomp all over it.
        */
       return true;
+   case SHADER_OPCODE_QUAD_SWIZZLE:
+      switch (src[1].ud) {
+      case BRW_SWIZZLE_XXXX:
+      case BRW_SWIZZLE_YYYY:
+      case BRW_SWIZZLE_ZZZZ:
+      case BRW_SWIZZLE_WWWW:
+      case BRW_SWIZZLE_XXZZ:
+      case BRW_SWIZZLE_YYWW:
+      case BRW_SWIZZLE_XYXY:
+      case BRW_SWIZZLE_ZWZW:
+         /* These can be implemented as a single Align1 region on all
+          * platforms, so there's never a hazard between source and
+          * destination.  C.f. fs_generator::generate_quad_swizzle().
+          */
+         return false;
+      default:
+         return !is_uniform(src[0]);
+      }
    default:
       /* The SIMD16 compressed instruction
        *
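To see the hazard the default case guards against: an arbitrary quad swizzle lowered to per-channel moves can read an element that an earlier channel already overwrote when the destination aliases the source. A toy C demonstration (hypothetical, not Mesa code) using the YXWZ channel order:

#include <stdio.h>
#include <string.h>

int main(void)
{
   int quad[4] = {10, 20, 30, 40};
   static const unsigned swiz[4] = {1, 0, 3, 2}; /* YXWZ */

   /* In place, channel by channel: channel 1 wants the original X,
    * but channel 0 already clobbered it. */
   int in_place[4];
   memcpy(in_place, quad, sizeof(quad));
   for (unsigned c = 0; c < 4; c++)
      in_place[c] = in_place[swiz[c]];

   /* Reading from an intact copy of the source gives the right answer. */
   int from_src[4];
   for (unsigned c = 0; c < 4; c++)
      from_src[c] = quad[swiz[c]];

   printf("in place: %d %d %d %d\n", in_place[0], in_place[1],
          in_place[2], in_place[3]);  /* 20 20 40 40 -- corrupted */
   printf("from src: %d %d %d %d\n", from_src[0], from_src[1],
          from_src[2], from_src[3]);  /* 20 10 40 30 -- correct */
   return 0;
}

The whitelisted swizzles avoid this because they become a single region MOV, and a uniform source makes every channel read the same element, so there is nothing left to clobber.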
@@ -5579,9 +5597,14 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
       return MIN2(8, inst->exec_size);
 
-   case SHADER_OPCODE_QUAD_SWIZZLE:
-      return 8;
+   case SHADER_OPCODE_QUAD_SWIZZLE: {
+      const unsigned swiz = inst->src[1].ud;
+      return (is_uniform(inst->src[0]) ?
+                 get_fpu_lowered_simd_width(devinfo, inst) :
+              devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
+              swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
+              get_fpu_lowered_simd_width(devinfo, inst));
+   }
 
    case SHADER_OPCODE_MOV_INDIRECT: {
       /* From IVB and HSW PRMs:
        *
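The chained ternary above packs four cases into one expression; here is an equivalent if/else sketch (illustrative rewrite with made-up parameter names, where fpu_width stands in for get_fpu_lowered_simd_width()):

#include <stdbool.h>

static unsigned quad_swizzle_lowered_width(bool src_is_uniform,
                                           unsigned gen, unsigned type_size,
                                           bool swiz_is_xyxy_or_zwzw,
                                           unsigned fpu_width)
{
   if (src_is_uniform) {
      /* A uniform source is just a scalar broadcast; only the usual
       * FPU region restrictions apply. */
      return fpu_width;
   }
   if (gen < 11 && type_size == 4) {
      /* Pre-ICL 32-bit values take the Align16 path, which handles
       * one SIMD8 MOV at a time. */
      return 8;
   }
   if (swiz_is_xyxy_or_zwzw) {
      /* These fast paths use a vstride-0 region that cannot advance
       * past one quad, so each instruction covers four channels. */
      return 4;
   }
   /* Everything else is limited only by the FPU region restrictions. */
   return fpu_width;
}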

src/intel/compiler/brw_fs.h

@@ -480,6 +480,10 @@ private:
                           struct brw_reg src,
                           struct brw_reg idx);
 
+   void generate_quad_swizzle(const fs_inst *inst,
+                              struct brw_reg dst, struct brw_reg src,
+                              unsigned swiz);
+
    bool patch_discard_jumps_to_fb_writes();
 
    const struct brw_compiler *compiler;

src/intel/compiler/brw_fs_generator.cpp

@@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst *inst,
    }
 }
 
+void
+fs_generator::generate_quad_swizzle(const fs_inst *inst,
+                                    struct brw_reg dst, struct brw_reg src,
+                                    unsigned swiz)
+{
+   /* Requires a quad. */
+   assert(inst->exec_size >= 4);
+
+   if (src.file == BRW_IMMEDIATE_VALUE ||
+       has_scalar_region(src)) {
+      /* The value is uniform across all channels */
+      brw_MOV(p, dst, src);
+
+   } else if (devinfo->gen < 11 && type_sz(src.type) == 4) {
+      /* This only works on 8-wide 32-bit values */
+      assert(inst->exec_size == 8);
+      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
+      assert(src.vstride == src.width + 1);
+      brw_set_default_access_mode(p, BRW_ALIGN_16);
+      struct brw_reg swiz_src = stride(src, 4, 4, 1);
+      swiz_src.swizzle = swiz;
+      brw_MOV(p, dst, swiz_src);
+
+   } else {
+      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
+      assert(src.vstride == src.width + 1);
+      const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));
+
+      switch (swiz) {
+      case BRW_SWIZZLE_XXXX:
+      case BRW_SWIZZLE_YYYY:
+      case BRW_SWIZZLE_ZZZZ:
+      case BRW_SWIZZLE_WWWW:
+         brw_MOV(p, dst, stride(src_0, 4, 4, 0));
+         break;
+
+      case BRW_SWIZZLE_XXZZ:
+      case BRW_SWIZZLE_YYWW:
+         brw_MOV(p, dst, stride(src_0, 2, 2, 0));
+         break;
+
+      case BRW_SWIZZLE_XYXY:
+      case BRW_SWIZZLE_ZWZW:
+         assert(inst->exec_size == 4);
+         brw_MOV(p, dst, stride(src_0, 0, 2, 1));
+         break;
+
+      default:
+         assert(inst->force_writemask_all);
+         brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
+
+         for (unsigned c = 0; c < 4; c++) {
+            brw_inst *insn = brw_MOV(
+               p, stride(suboffset(dst, c),
+                         4 * inst->dst.stride, 1, 4 * inst->dst.stride),
+               stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));
+
+            brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
+            brw_inst_set_no_dd_check(devinfo, insn, c > 0);
+         }
+
+         break;
+      }
+   }
+}
+
 void
 fs_generator::generate_urb_read(fs_inst *inst,
                                 struct brw_reg dst,
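In the default case above, each of the four MOVs copies source channel BRW_GET_SWZ(swiz, c) of every quad into destination channel c, through regions that step four elements per quad; the exec size is divided by four since each MOV handles one element per quad. The no_dd_clear/no_dd_check settings are the usual dependency-control chaining for instructions that write disjoint interleaved parts of the same destination register. A standalone C model of the data movement (illustration only, not Mesa code), using BRW_SWIZZLE_WZYX over a SIMD8 two-quad register:

#include <stdio.h>

int main(void)
{
   float src[8] = {0, 1, 2, 3, 10, 11, 12, 13};
   float dst[8];
   const unsigned swiz[4] = {3, 2, 1, 0}; /* channel order WZYX */

   for (unsigned c = 0; c < 4; c++) {
      /* One strided MOV: dst region <4;1,4> at suboffset c, src
       * region <4;1,0> at suboffset swiz[c] -- one element per quad. */
      for (unsigned q = 0; q < 2; q++)
         dst[4 * q + c] = src[4 * q + swiz[c]];
   }

   for (unsigned i = 0; i < 8; i++)
      printf("%g ", dst[i]);  /* 3 2 1 0 13 12 11 10 */
   printf("\n");
   return 0;
}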
@@ -2303,23 +2369,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          break;
 
       case SHADER_OPCODE_QUAD_SWIZZLE:
-         /* This only works on 8-wide 32-bit values */
-         assert(inst->exec_size == 8);
-         assert(type_sz(src[0].type) == 4);
-         assert(inst->force_writemask_all);
          assert(src[1].file == BRW_IMMEDIATE_VALUE);
          assert(src[1].type == BRW_REGISTER_TYPE_UD);
-
-         if (src[0].file == BRW_IMMEDIATE_VALUE ||
-             (src[0].vstride == 0 && src[0].hstride == 0)) {
-            /* The value is uniform across all channels */
-            brw_MOV(p, dst, src[0]);
-         } else {
-            brw_set_default_access_mode(p, BRW_ALIGN_16);
-            struct brw_reg swiz_src = stride(src[0], 4, 4, 1);
-            swiz_src.swizzle = inst->src[1].ud;
-            brw_MOV(p, dst, swiz_src);
-         }
+         generate_quad_swizzle(inst, dst, src[0], src[1].ud);
          break;
 
       case SHADER_OPCODE_CLUSTER_BROADCAST: {
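A note on the immediate in src[1]: the swizzle is packed two bits per channel, as in Mesa's brw_reg.h (the macros below are reproduced for illustration; treat the exact packing as an assumption):

#include <stdio.h>

/* Assumed 2-bits-per-channel packing matching BRW_SWIZZLE4() and
 * BRW_GET_SWZ() in brw_reg.h. */
#define SWIZZLE4(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
#define GET_SWZ(swz, idx)    (((swz) >> ((idx) * 2)) & 0x3)

int main(void)
{
   const unsigned xxzz = SWIZZLE4(0, 0, 2, 2); /* BRW_SWIZZLE_XXZZ */
   const unsigned zwzw = SWIZZLE4(2, 3, 2, 3); /* BRW_SWIZZLE_ZWZW */

   printf("XXZZ = 0x%02x, channel sources %u %u %u %u\n", xxzz,
          GET_SWZ(xxzz, 0), GET_SWZ(xxzz, 1),
          GET_SWZ(xxzz, 2), GET_SWZ(xxzz, 3));
   printf("ZWZW = 0x%02x, channel sources %u %u %u %u\n", zwzw,
          GET_SWZ(zwzw, 0), GET_SWZ(zwzw, 1),
          GET_SWZ(zwzw, 2), GET_SWZ(zwzw, 3));
   return 0;
}

This is what BRW_GET_SWZ(swiz, c) unpacks in generate_quad_swizzle().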