intel/compiler: Lower 8-bit ops to 16-bit in NIR on all platforms
This fixes the Crucible func.shader.shift.int8_t test on Gen8 and Gen9. See https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76. With the previous optimizations in place, this change seems to improve the quality of the generated code. Comparing a couple Vulkan CTS tests on Skylake had the following results. dEQP-VK.spirv_assembly.type.vec3.i8.bitwise_xor_frag: SIMD8 shader: 36 instructions. 1 loops. 3822 cycles. 0:0 spills:fills, 5 sends SIMD8 shader: 27 instructions. 1 loops. 2742 cycles. 0:0 spills:fills, 5 sends dEQP-VK.spirv_assembly.type.vec3.i8.max_frag: SIMD8 shader: 39 instructions. 1 loops. 3922 cycles. 0:0 spills:fills, 5 sends SIMD8 shader: 37 instructions. 1 loops. 3682 cycles. 0:0 spills:fills, 5 sends Reviewed-by: Francisco Jerez <currojerez@riseup.net> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9025>
This commit is contained in:
@@ -193,9 +193,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
|
|||||||
nir_options->lower_int64_options = int64_options;
|
nir_options->lower_int64_options = int64_options;
|
||||||
nir_options->lower_doubles_options = fp64_options;
|
nir_options->lower_doubles_options = fp64_options;
|
||||||
|
|
||||||
/* Starting with Gfx11, we lower away 8-bit arithmetic */
|
|
||||||
nir_options->support_8bit_alu = devinfo->ver < 11;
|
|
||||||
|
|
||||||
nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT;
|
nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT;
|
||||||
|
|
||||||
nir_options->force_indirect_unrolling |=
|
nir_options->force_indirect_unrolling |=
|
||||||
|
@@ -993,8 +993,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
|
|||||||
|
|
||||||
default:
|
default:
|
||||||
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
|
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
|
||||||
assert((devinfo->ver == 8 || devinfo->ver == 9) ||
|
assert(type_sz(op[i].type) > 1);
|
||||||
type_sz(op[i].type) > 1);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@@ -1836,6 +1835,35 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
|
|||||||
case nir_op_bitfield_insert:
|
case nir_op_bitfield_insert:
|
||||||
unreachable("not reached: should have been lowered");
|
unreachable("not reached: should have been lowered");
|
||||||
|
|
||||||
|
/* For all shift operations:
|
||||||
|
*
|
||||||
|
* Gen4 - Gen7: After application of source modifiers, the low 5-bits of
|
||||||
|
* src1 are used an unsigned value for the shift count.
|
||||||
|
*
|
||||||
|
* Gen8: As with earlier platforms, but for Q and UQ types on src0, the low
|
||||||
|
* 6-bit of src1 are used.
|
||||||
|
*
|
||||||
|
* Gen9+: The low bits of src1 matching the size of src0 (e.g., 4-bits for
|
||||||
|
* W or UW src0).
|
||||||
|
*
|
||||||
|
* The implication is that the following instruction will produce a
|
||||||
|
* different result on Gen9+ than on previous platforms:
|
||||||
|
*
|
||||||
|
* shr(8) g4<1>UW g12<8,8,1>UW 0x0010UW
|
||||||
|
*
|
||||||
|
* where Gen9+ will shift by zero, and earlier platforms will shift by 16.
|
||||||
|
*
|
||||||
|
* This does not seem to be the case. Experimentally, it has been
|
||||||
|
* determined that shifts of 16-bit values on Gen8 behave properly. Shifts
|
||||||
|
* of 8-bit values on both Gen8 and Gen9 do not. Gen11+ lowers 8-bit
|
||||||
|
* values, so those platforms were not tested. No features expose access
|
||||||
|
* to 8- or 16-bit types on Gen7 or earlier, so those platforms were not
|
||||||
|
* tested either. See
|
||||||
|
* https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76.
|
||||||
|
*
|
||||||
|
* This is part of the reason 8-bit values are lowered to 16-bit on all
|
||||||
|
* platforms.
|
||||||
|
*/
|
||||||
case nir_op_ishl:
|
case nir_op_ishl:
|
||||||
bld.SHL(result, op[0], op[1]);
|
bld.SHL(result, op[0], op[1]);
|
||||||
break;
|
break;
|
||||||
|
@@ -675,15 +675,14 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data)
|
|||||||
assert(!"Should have been lowered by nir_opt_algebraic.");
|
assert(!"Should have been lowered by nir_opt_algebraic.");
|
||||||
return 0;
|
return 0;
|
||||||
default:
|
default:
|
||||||
if (devinfo->ver >= 11) {
|
if (nir_op_infos[alu->op].num_inputs >= 2 &&
|
||||||
if (nir_op_infos[alu->op].num_inputs >= 2 &&
|
alu->dest.dest.ssa.bit_size == 8)
|
||||||
alu->dest.dest.ssa.bit_size == 8)
|
return 16;
|
||||||
return 16;
|
|
||||||
|
if (nir_alu_instr_is_comparison(alu) &&
|
||||||
|
alu->src[0].src.ssa->bit_size == 8)
|
||||||
|
return 16;
|
||||||
|
|
||||||
if (nir_alu_instr_is_comparison(alu) &&
|
|
||||||
alu->src[0].src.ssa->bit_size == 8)
|
|
||||||
return 16;
|
|
||||||
}
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -704,7 +703,7 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data)
|
|||||||
case nir_intrinsic_quad_swap_horizontal:
|
case nir_intrinsic_quad_swap_horizontal:
|
||||||
case nir_intrinsic_quad_swap_vertical:
|
case nir_intrinsic_quad_swap_vertical:
|
||||||
case nir_intrinsic_quad_swap_diagonal:
|
case nir_intrinsic_quad_swap_diagonal:
|
||||||
if (intrin->src[0].ssa->bit_size == 8 && devinfo->ver >= 11)
|
if (intrin->src[0].ssa->bit_size == 8)
|
||||||
return 16;
|
return 16;
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
@@ -737,7 +736,7 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data)
|
|||||||
|
|
||||||
case nir_instr_type_phi: {
|
case nir_instr_type_phi: {
|
||||||
nir_phi_instr *phi = nir_instr_as_phi(instr);
|
nir_phi_instr *phi = nir_instr_as_phi(instr);
|
||||||
if (devinfo->ver >= 11 && phi->dest.ssa.bit_size == 8)
|
if (phi->dest.ssa.bit_size == 8)
|
||||||
return 16;
|
return 16;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user