intel/fs: make scan/reduce work with SIMD32 when it fits 2 registers

When dealing with uint16_t and uint8_t on SIMD32, we can do all the
operations using just 2 registers, so we don't hit the recursion at
the beginning of emit_scan(). Because of that, emit_scan() itself has
to compute the scan/reduce for channels 31:16.

v2: Add the instructions that v1 still missed (Jason).

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
This commit is contained in:
Paulo Zanoni
2019-08-09 15:40:33 -07:00
committed by Jason Ekstrand
parent 7f07046dbc
commit d9ddf5076d

View File

@@ -514,6 +514,16 @@ namespace brw {
right = horiz_offset(tmp, 8 + 4);
set_condmod(mod, ubld.emit(opcode, right, left, right));
}
if (dispatch_width() > 16) {
left = component(tmp, 16 + 3);
right = horiz_offset(tmp, 16 + 4);
set_condmod(mod, ubld.emit(opcode, right, left, right));
left = component(tmp, 24 + 3);
right = horiz_offset(tmp, 24 + 4);
set_condmod(mod, ubld.emit(opcode, right, left, right));
}
}
if (cluster_size > 8 && dispatch_width() > 8) {
@@ -521,6 +531,19 @@ namespace brw {
src_reg left = component(tmp, 7);
dst_reg right = horiz_offset(tmp, 8);
set_condmod(mod, ubld.emit(opcode, right, left, right));
if (dispatch_width() > 16) {
left = component(tmp, 16 + 7);
right = horiz_offset(tmp, 16 + 8);
set_condmod(mod, ubld.emit(opcode, right, left, right));
}
}
if (cluster_size > 16 && dispatch_width() > 16) {
const fs_builder ubld = exec_all().group(16, 0);
src_reg left = component(tmp, 15);
dst_reg right = horiz_offset(tmp, 16);
set_condmod(mod, ubld.emit(opcode, right, left, right));
}
}