nir/algebraic: Separate has_dot_4x8 into has_sdot_4x8 and has_udot_4x8
Adreno GPUs have native instructions for unsigned and mixed dot_4x8, but not for the signed dot product. Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com> Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Emma Anholt <emma@anholt.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13986>
This commit is contained in:
@@ -183,7 +183,7 @@ summarize_repack(nir_builder *b, nir_ssa_def *packed_counts, unsigned num_lds_dw
|
||||
|
||||
nir_ssa_def *lane_id = nir_load_subgroup_invocation(b);
|
||||
nir_ssa_def *shift = nir_iadd_imm_nuw(b, nir_imul_imm(b, lane_id, -4u), num_lds_dwords * 16);
|
||||
bool use_dot = b->shader->options->has_dot_4x8;
|
||||
bool use_dot = b->shader->options->has_udot_4x8;
|
||||
|
||||
if (num_lds_dwords == 1) {
|
||||
nir_ssa_def *dot_op = !use_dot ? NULL : nir_ushr(b, nir_ushr(b, nir_imm_int(b, 0x01010101), shift), shift);
|
||||
|
@@ -86,7 +86,8 @@ radv_get_nir_options(struct radv_physical_device *device)
|
||||
.lower_iadd_sat = device->rad_info.chip_class <= GFX8,
|
||||
.has_fsub = true,
|
||||
.has_isub = true,
|
||||
.has_dot_4x8 = device->rad_info.has_accelerated_dot_product,
|
||||
.has_sdot_4x8 = device->rad_info.has_accelerated_dot_product,
|
||||
.has_udot_4x8 = device->rad_info.has_accelerated_dot_product,
|
||||
.has_dot_2x16 = device->rad_info.has_accelerated_dot_product,
|
||||
.use_scoped_barrier = true,
|
||||
.max_unroll_iterations = 32,
|
||||
|
@@ -3430,8 +3430,11 @@ typedef struct nir_shader_compiler_options {
|
||||
* for rect texture lowering. */
|
||||
bool has_txs;
|
||||
|
||||
/** Backend supports sdot_4x8 and udot_4x8 opcodes. */
|
||||
bool has_dot_4x8;
|
||||
/** Backend supports sdot_4x8 opcodes. */
|
||||
bool has_sdot_4x8;
|
||||
|
||||
/** Backend supports udot_4x8 opcodes. */
|
||||
bool has_udot_4x8;
|
||||
|
||||
/** Backend supports sudot_4x8 opcodes. */
|
||||
bool has_sudot_4x8;
|
||||
|
@@ -258,8 +258,8 @@ udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
|
||||
('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))
|
||||
|
||||
optimizations.extend([
|
||||
(('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_dot_4x8'),
|
||||
(('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_dot_4x8'),
|
||||
(('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
|
||||
(('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'),
|
||||
(('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
|
||||
(('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
|
||||
(('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),
|
||||
@@ -269,13 +269,13 @@ optimizations.extend([
|
||||
# overflowing. 0x100000000 - 0x3f804 = 0xfffc07fc. If c is a constant
|
||||
# that is less than 0xfffc07fc, then the result cannot overflow ever.
|
||||
(('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
|
||||
(('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_dot_4x8'),
|
||||
(('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_udot_4x8'),
|
||||
|
||||
# For the signed dot-product, the largest positive value is 4*(-128*-128) =
|
||||
# 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00. We
|
||||
# don't have to worry about that intermediate result overflowing or
|
||||
# underflowing.
|
||||
(('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_dot_4x8'),
|
||||
(('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
|
||||
|
||||
(('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
|
||||
|
||||
|
@@ -1066,7 +1066,8 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
|
||||
.lower_insert_word = true,
|
||||
.lower_rotate = true,
|
||||
.lower_to_scalar = true,
|
||||
.has_dot_4x8 = sscreen->info.has_accelerated_dot_product,
|
||||
.has_sdot_4x8 = sscreen->info.has_accelerated_dot_product,
|
||||
.has_udot_4x8 = sscreen->info.has_accelerated_dot_product,
|
||||
.has_dot_2x16 = sscreen->info.has_accelerated_dot_product,
|
||||
.optimize_sample_mask_in = true,
|
||||
.max_unroll_iterations = 128,
|
||||
|
@@ -178,7 +178,8 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
|
||||
nir_options->lower_bitfield_reverse = devinfo->ver < 7;
|
||||
nir_options->has_iadd3 = devinfo->verx10 >= 125;
|
||||
|
||||
nir_options->has_dot_4x8 = devinfo->ver >= 12;
|
||||
nir_options->has_sdot_4x8 = devinfo->ver >= 12;
|
||||
nir_options->has_udot_4x8 = devinfo->ver >= 12;
|
||||
nir_options->has_sudot_4x8 = devinfo->ver >= 12;
|
||||
|
||||
nir_options->lower_int64_options = int64_options;
|
||||
|
Reference in New Issue
Block a user