diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h index a178fa2c020..c649b258056 100644 --- a/src/freedreno/common/freedreno_dev_info.h +++ b/src/freedreno/common/freedreno_dev_info.h @@ -290,6 +290,8 @@ struct fd_dev_info { * R8G8B8A8_UNORM in the mutable formats list. */ bool ubwc_all_formats_compatible; + + bool has_compliant_dp4acc; } a7xx; }; diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index 1dbe7fc7b7e..7d556a7faa2 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -881,6 +881,7 @@ a7xx_750 = A7XXProps( gs_vpc_adjacency_quirk = True, storage_8bit = True, ubwc_all_formats_compatible = True, + has_compliant_dp4acc = True, ) a730_magic_regs = dict( diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index f01ec9a1813..ae7abb3472b 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -206,6 +206,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->has_dp2acc = dev_info->a6xx.has_dp2acc; compiler->has_dp4acc = dev_info->a6xx.has_dp4acc; + compiler->has_compliant_dp4acc = dev_info->a7xx.has_compliant_dp4acc; if (compiler->gen == 6 && options->shared_push_consts) { compiler->shared_consts_base_offset = 504; @@ -301,14 +302,19 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, if (compiler->gen >= 6) { compiler->nir_options.vectorize_io = true, compiler->nir_options.force_indirect_unrolling = nir_var_all, + compiler->nir_options.lower_device_index_to_zero = true; - compiler->nir_options.lower_device_index_to_zero = true, - compiler->nir_options.has_udot_4x8 = true, - compiler->nir_options.has_sudot_4x8 = true, - compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc; - compiler->nir_options.has_sudot_4x8 = dev_info->a6xx.has_dp2acc; - compiler->nir_options.has_udot_4x8_sat = dev_info->a6xx.has_dp2acc; - compiler->nir_options.has_sudot_4x8_sat = dev_info->a6xx.has_dp2acc; + if (dev_info->a6xx.has_dp2acc || dev_info->a6xx.has_dp4acc) { + compiler->nir_options.has_udot_4x8 = + compiler->nir_options.has_udot_4x8_sat = true; + compiler->nir_options.has_sudot_4x8 = + compiler->nir_options.has_sudot_4x8_sat = true; + } + + if (dev_info->a6xx.has_dp4acc && dev_info->a7xx.has_compliant_dp4acc) { + compiler->nir_options.has_sdot_4x8 = + compiler->nir_options.has_sdot_4x8_sat = true; + } } else if (compiler->gen >= 3 && compiler->gen <= 5) { compiler->nir_options.vertex_id_zero_based = true; } else if (compiler->gen <= 2) { diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index 3bee39a3185..9b9b3dcc406 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -239,6 +239,7 @@ struct ir3_compiler { bool has_dp2acc; bool has_dp4acc; + bool has_compliant_dp4acc; /* Type to use for 1b nir bools: */ type_t bool_type; diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 19b6d6784f4..c2d4797e505 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -356,6 +356,37 @@ emit_alu_dot_4x8_as_dp4acc(struct ir3_context *ctx, nir_alu_instr *alu, struct ir3_instruction **dst, struct ir3_instruction **src) { + if (ctx->compiler->has_compliant_dp4acc) { + dst[0] = ir3_DP4ACC(ctx->block, src[0], 0, src[1], 0, src[2], 0); + + /* This is actually the LHS signedness attribute. + * IR3_SRC_UNSIGNED ~ unsigned LHS (i.e. OpUDot and OpUDotAccSat). + */ + if (alu->op == nir_op_udot_4x8_uadd || + alu->op == nir_op_udot_4x8_uadd_sat) { + dst[0]->cat3.signedness = IR3_SRC_UNSIGNED; + } else { + dst[0]->cat3.signedness = IR3_SRC_MIXED; + } + + /* This is actually the RHS signedness attribute. + * IR3_SRC_PACKED_HIGH ~ signed RHS (i.e. OpSDot and OpSDotAccSat). + */ + if (alu->op == nir_op_sdot_4x8_iadd || + alu->op == nir_op_sdot_4x8_iadd_sat) { + dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH; + } else { + dst[0]->cat3.packed = IR3_SRC_PACKED_LOW; + } + + if (alu->op == nir_op_udot_4x8_uadd_sat || + alu->op == nir_op_sdot_4x8_iadd_sat || + alu->op == nir_op_sudot_4x8_iadd_sat) { + dst[0]->flags |= IR3_INSTR_SAT; + } + return; + } + struct ir3_instruction *accumulator = NULL; if (alu->op == nir_op_udot_4x8_uadd_sat) { accumulator = create_immed(ctx->block, 0); @@ -446,6 +477,8 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) /* it probably isn't worth emulating these with scalar-only ops */ alu->op != nir_op_udot_4x8_uadd && alu->op != nir_op_udot_4x8_uadd_sat && + alu->op != nir_op_sdot_4x8_iadd && + alu->op != nir_op_sdot_4x8_iadd_sat && alu->op != nir_op_sudot_4x8_iadd && alu->op != nir_op_sudot_4x8_iadd_sat && /* not supported in HW, we have to fall back to normal registers */ @@ -896,6 +929,8 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) case nir_op_udot_4x8_uadd: case nir_op_udot_4x8_uadd_sat: + case nir_op_sdot_4x8_iadd: + case nir_op_sdot_4x8_iadd_sat: case nir_op_sudot_4x8_iadd: case nir_op_sudot_4x8_iadd_sat: { if (ctx->compiler->has_dp4acc) {