nir/algebraic: Separate has_dot_4x8 into has_sdot_4x8 and has_udot_4x8

Adreno GPUs have native instructions for the unsigned and mixed dot_4x8
but not for the signed dot product.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13986>
This commit is contained in:
Danylo Piliaiev
2021-11-26 19:27:03 +02:00
parent c1d5c318bc
commit b8d486f298
6 changed files with 16 additions and 10 deletions

View File

@@ -183,7 +183,7 @@ summarize_repack(nir_builder *b, nir_ssa_def *packed_counts, unsigned num_lds_dw
nir_ssa_def *lane_id = nir_load_subgroup_invocation(b);
nir_ssa_def *shift = nir_iadd_imm_nuw(b, nir_imul_imm(b, lane_id, -4u), num_lds_dwords * 16);
bool use_dot = b->shader->options->has_dot_4x8;
bool use_dot = b->shader->options->has_udot_4x8;
if (num_lds_dwords == 1) {
nir_ssa_def *dot_op = !use_dot ? NULL : nir_ushr(b, nir_ushr(b, nir_imm_int(b, 0x01010101), shift), shift);

View File

@@ -86,7 +86,8 @@ radv_get_nir_options(struct radv_physical_device *device)
.lower_iadd_sat = device->rad_info.chip_class <= GFX8,
.has_fsub = true,
.has_isub = true,
.has_dot_4x8 = device->rad_info.has_accelerated_dot_product,
.has_sdot_4x8 = device->rad_info.has_accelerated_dot_product,
.has_udot_4x8 = device->rad_info.has_accelerated_dot_product,
.has_dot_2x16 = device->rad_info.has_accelerated_dot_product,
.use_scoped_barrier = true,
.max_unroll_iterations = 32,

View File

@@ -3430,8 +3430,11 @@ typedef struct nir_shader_compiler_options {
* for rect texture lowering. */
bool has_txs;
/** Backend supports sdot_4x8 and udot_4x8 opcodes. */
bool has_dot_4x8;
/** Backend supports sdot_4x8 opcodes. */
bool has_sdot_4x8;
/** Backend supports udot_4x8 opcodes. */
bool has_udot_4x8;
/** Backend supports sudot_4x8 opcodes. */
bool has_sudot_4x8;

View File

@@ -258,8 +258,8 @@ udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))
optimizations.extend([
(('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_dot_4x8'),
(('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_dot_4x8'),
(('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
(('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'),
(('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
(('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
(('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),
@@ -269,13 +269,13 @@ optimizations.extend([
# overflowing. 0x100000000 - 0x3f804 = 0xfffc07fc. If c is a constant
# that is less than 0xfffc07fc, then the result cannot overflow ever.
(('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
(('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_dot_4x8'),
(('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_udot_4x8'),
# For the signed dot-product, the largest positive value is 4*(-128*-128) =
# 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00. We
# don't have to worry about that intermediate result overflowing or
# underflowing.
(('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_dot_4x8'),
(('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
(('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),

View File

@@ -1066,7 +1066,8 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
.lower_insert_word = true,
.lower_rotate = true,
.lower_to_scalar = true,
.has_dot_4x8 = sscreen->info.has_accelerated_dot_product,
.has_sdot_4x8 = sscreen->info.has_accelerated_dot_product,
.has_udot_4x8 = sscreen->info.has_accelerated_dot_product,
.has_dot_2x16 = sscreen->info.has_accelerated_dot_product,
.optimize_sample_mask_in = true,
.max_unroll_iterations = 128,

View File

@@ -178,7 +178,8 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
nir_options->lower_bitfield_reverse = devinfo->ver < 7;
nir_options->has_iadd3 = devinfo->verx10 >= 125;
nir_options->has_dot_4x8 = devinfo->ver >= 12;
nir_options->has_sdot_4x8 = devinfo->ver >= 12;
nir_options->has_udot_4x8 = devinfo->ver >= 12;
nir_options->has_sudot_4x8 = devinfo->ver >= 12;
nir_options->lower_int64_options = int64_options;