From d77bfc117cbed559221d460f270ab21ea27d3032 Mon Sep 17 00:00:00 2001
From: Danylo Piliaiev <dpiliaiev@igalia.com>
Date: Fri, 26 Nov 2021 18:57:52 +0200
Subject: [PATCH] tu,ir3: Implement VK_KHR_shader_integer_dot_product

- gen4 - has dp4acc and dp2acc, dp4acc is used to implement
  4x8 dot product.
- gen3 - has dp2acc, in OpenCL blob uses dp2acc for dot product
  on both get3 and gen4.
- gen2 - unknown, lower everything.
- gen1 - no dp2acc, lower everything. OpenCL blob doesn't advertise
  cl_qcom_dot_product8 but still generates code for it.
  The assembly is more verbose and uses yet to be documented
  mad32.u16 instruction.

Passes:
 dEQP-VK.spirv_assembly.instruction.compute.opsdotkhr.*
 dEQP-VK.spirv_assembly.instruction.compute.opudotkhr.*
 dEQP-VK.spirv_assembly.instruction.compute.opsudotkhr.*
 dEQP-VK.spirv_assembly.instruction.compute.opsdotaccsatkhr.*
 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.*
 dEQP-VK.spirv_assembly.instruction.compute.opsudotaccsatkhr.*

Only packed 4x8 unsigned and mixed versions are accelerated.
However in theory we should be able to do better for signed version
than current NIR lowering.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13986>
---
 docs/features.txt                         |  2 +-
 src/freedreno/common/freedreno_dev_info.h |  3 +
 src/freedreno/common/freedreno_devices.py |  4 +
 src/freedreno/ir3/ir3_compiler.c          | 21 +++--
 src/freedreno/ir3/ir3_compiler.h          |  3 +
 src/freedreno/ir3/ir3_compiler_nir.c      | 95 +++++++++++++++++++++++
 src/freedreno/vulkan/tu_device.c          | 49 ++++++++++++
 7 files changed, 168 insertions(+), 9 deletions(-)

diff --git a/docs/features.txt b/docs/features.txt
index ec15bbbb62b..2718d20d9f7 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -493,7 +493,7 @@ Khronos extensions that are not part of any Vulkan version:
   VK_KHR_pipeline_executable_properties                 DONE (anv, radv, tu)
   VK_KHR_push_descriptor                                DONE (anv, lvp, radv, tu)
   VK_KHR_shader_clock                                   DONE (anv, radv)
-  VK_KHR_shader_integer_dot_product                     DONE (radv)
+  VK_KHR_shader_integer_dot_product                     DONE (anv, radv, tu)
   VK_KHR_shader_non_semantic_info                       DONE (anv, radv)
   VK_KHR_shader_subgroup_uniform_control_flow           DONE (anv, radv)
   VK_KHR_shader_terminate_invocation                    DONE (anv, radv, tu)
diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index 14149325637..0a1ab85ef9a 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -130,6 +130,9 @@ struct fd_dev_info {
 
          bool has_getfiberid;
 
+         bool has_dp2acc;
+         bool has_dp4acc;
+
          struct {
             uint32_t RB_UNKNOWN_8E04_blit;
             uint32_t PC_POWER_CNTL;
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index 5dac42c518c..4bcdec0e237 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -224,6 +224,7 @@ a6xx_gen2 = dict(
         has_z24uint_s8uint = True,
         indirect_draw_wfm_quirk = True,
         depth_bounds_require_depth_test_quirk = True, # TODO: check if true
+        has_dp2acc = False, # TODO: check if true
         magic = dict(
             TPL1_DBG_ECO_CNTL = 0,
         ),
@@ -243,6 +244,7 @@ a6xx_gen3 = dict(
         has_sample_locations = True,
         has_ccu_flush_bug = True,
         has_8bpp_ubwc = False,
+        has_dp2acc = True,
         magic = dict(
             # this seems to be a chicken bit that fixes cubic filtering:
             TPL1_DBG_ECO_CNTL = 0x1000000,
@@ -266,6 +268,8 @@ a6xx_gen4 = dict(
         has_lpac = True,
         has_shading_rate = True,
         has_getfiberid = True,
+        has_dp2acc = True,
+        has_dp4acc = True,
         magic = dict(
             TPL1_DBG_ECO_CNTL = 0x5008000,
         ),
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 8d98014632a..077320a26b9 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -182,6 +182,8 @@ static const nir_shader_compiler_options options_a6xx = {
    .lower_uniforms_to_ubo = true,
    .lower_device_index_to_zero = true,
    .use_scoped_barrier = true,
+   .has_udot_4x8 = true,
+   .has_sudot_4x8 = true,
 };
 
 struct ir3_compiler *
@@ -212,6 +214,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
 
    compiler->max_variable_workgroup_size = 1024;
 
+   const struct fd_dev_info *dev_info = fd_dev_info(compiler->dev_id);
+
    if (compiler->gen >= 6) {
       compiler->samgq_workaround = true;
       /* a6xx split the pipeline state into geometry and fragment state, in
@@ -241,14 +245,14 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
       /* TODO: implement private memory on earlier gen's */
       compiler->has_pvtmem = true;
 
-      compiler->tess_use_shared =
-            fd_dev_info(compiler->dev_id)->a6xx.tess_use_shared;
+      compiler->tess_use_shared = dev_info->a6xx.tess_use_shared;
 
-      compiler->storage_16bit =
-            fd_dev_info(compiler->dev_id)->a6xx.storage_16bit;
+      compiler->storage_16bit = dev_info->a6xx.storage_16bit;
 
-      compiler->has_getfiberid =
-            fd_dev_info(compiler->dev_id)->a6xx.has_getfiberid;
+      compiler->has_getfiberid = dev_info->a6xx.has_getfiberid;
+
+      compiler->has_dp2acc = dev_info->a6xx.has_dp2acc;
+      compiler->has_dp4acc = dev_info->a6xx.has_dp4acc;
    } else {
       compiler->max_const_pipeline = 512;
       compiler->max_const_geom = 512;
@@ -262,8 +266,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
    }
 
    if (compiler->gen >= 6) {
-      compiler->reg_size_vec4 =
-            fd_dev_info(compiler->dev_id)->a6xx.reg_size_vec4;
+      compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4;
    } else if (compiler->gen >= 4) {
       /* On a4xx-a5xx, using r24.x and above requires using the smallest
        * threadsize.
@@ -309,6 +312,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
 
    if (compiler->gen >= 6) {
       compiler->nir_options = options_a6xx;
+      compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc;
+      compiler->nir_options.has_sudot_4x8 = dev_info->a6xx.has_dp2acc;
    } else {
       compiler->nir_options = options;
    }
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index d8bfe1c0925..1767c646ac6 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -172,6 +172,9 @@ struct ir3_compiler {
    /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */
    uint32_t max_variable_workgroup_size;
 
+   bool has_dp2acc;
+   bool has_dp4acc;
+
    /* Type to use for 1b nir bools: */
    type_t bool_type;
 };
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index bbb58abafb7..17449664e40 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -289,6 +289,76 @@ resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src,
    return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16);
 }
 
+static void
+emit_alu_dot_4x8_as_dp4acc(struct ir3_context *ctx, nir_alu_instr *alu,
+                           struct ir3_instruction **dst,
+                           struct ir3_instruction **src)
+{
+   struct ir3_instruction *accumulator = NULL;
+   if (alu->op == nir_op_udot_4x8_uadd_sat) {
+      accumulator = create_immed(ctx->block, 0);
+   } else {
+      accumulator = src[2];
+   }
+
+   dst[0] = ir3_DP4ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0);
+
+   if (alu->op == nir_op_udot_4x8_uadd ||
+       alu->op == nir_op_udot_4x8_uadd_sat) {
+      dst[0]->cat3.signedness = IR3_SRC_UNSIGNED;
+   } else {
+      dst[0]->cat3.signedness = IR3_SRC_MIXED;
+   }
+
+   /* For some reason (sat) doesn't work in unsigned case so
+    * we have to emulate it.
+    */
+   if (alu->op == nir_op_udot_4x8_uadd_sat) {
+      dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0);
+      dst[0]->flags |= IR3_INSTR_SAT;
+   } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
+      dst[0]->flags |= IR3_INSTR_SAT;
+   }
+}
+
+static void
+emit_alu_dot_4x8_as_dp2acc(struct ir3_context *ctx, nir_alu_instr *alu,
+                           struct ir3_instruction **dst,
+                           struct ir3_instruction **src)
+{
+   int signedness;
+   if (alu->op == nir_op_udot_4x8_uadd ||
+       alu->op == nir_op_udot_4x8_uadd_sat) {
+      signedness = IR3_SRC_UNSIGNED;
+   } else {
+      signedness = IR3_SRC_MIXED;
+   }
+
+   struct ir3_instruction *accumulator = NULL;
+   if (alu->op == nir_op_udot_4x8_uadd_sat ||
+       alu->op == nir_op_sudot_4x8_iadd_sat) {
+      accumulator = create_immed(ctx->block, 0);
+   } else {
+      accumulator = src[2];
+   }
+
+   dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0);
+   dst[0]->cat3.packed = IR3_SRC_PACKED_LOW;
+   dst[0]->cat3.signedness = signedness;
+
+   dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, dst[0], 0);
+   dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH;
+   dst[0]->cat3.signedness = signedness;
+
+   if (alu->op == nir_op_udot_4x8_uadd_sat) {
+      dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0);
+      dst[0]->flags |= IR3_INSTR_SAT;
+   } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
+      dst[0] = ir3_ADD_S(ctx->block, dst[0], 0, src[2], 0);
+      dst[0]->flags |= IR3_INSTR_SAT;
+   }
+}
+
 static void
 emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
 {
@@ -744,6 +814,31 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
       dst[0] = ir3_BFREV_B(b, src[0], 0);
       break;
 
+   case nir_op_uadd_sat:
+      dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
+      dst[0]->flags |= IR3_INSTR_SAT;
+      break;
+   case nir_op_iadd_sat:
+      dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0);
+      dst[0]->flags |= IR3_INSTR_SAT;
+      break;
+
+   case nir_op_udot_4x8_uadd:
+   case nir_op_udot_4x8_uadd_sat:
+   case nir_op_sudot_4x8_iadd:
+   case nir_op_sudot_4x8_iadd_sat: {
+      if (ctx->compiler->has_dp4acc) {
+         emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst, src);
+      } else if (ctx->compiler->has_dp2acc) {
+         emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst, src);
+      } else {
+         ir3_context_error(ctx, "ALU op should have been lowered: %s\n",
+                           nir_op_infos[alu->op].name);
+      }
+
+      break;
+   }
+
    default:
       ir3_context_error(ctx, "Unhandled ALU op: %s\n",
                         nir_op_infos[alu->op].name);
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index 417f24ff3c5..908885eacdb 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -157,6 +157,7 @@ get_device_extensions(const struct tu_physical_device *device,
       .KHR_driver_properties = true,
       .KHR_separate_depth_stencil_layouts = true,
       .KHR_buffer_device_address = true,
+      .KHR_shader_integer_dot_product = true,
 #ifndef TU_USE_KGSL
       .KHR_timeline_semaphore = true,
 #endif
@@ -790,6 +791,12 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
          features->computeFullSubgroups = true;
          break;
       }
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR: {
+         VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR *features =
+            (VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR *)ext;
+         features->shaderIntegerDotProduct = true;
+         break;
+      };
 
       default:
          break;
@@ -1159,6 +1166,48 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          props->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
          break;
       }
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR: {
+         VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR *props =
+            (VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR *)ext;
+
+         props->integerDotProduct8BitUnsignedAccelerated = false;
+         props->integerDotProduct8BitSignedAccelerated = false;
+         props->integerDotProduct8BitMixedSignednessAccelerated = false;
+         props->integerDotProduct4x8BitPackedUnsignedAccelerated =
+            pdevice->info->a6xx.has_dp2acc;
+         /* TODO: we should be able to emulate 4x8BitPackedSigned fast enough */
+         props->integerDotProduct4x8BitPackedSignedAccelerated = false;
+         props->integerDotProduct4x8BitPackedMixedSignednessAccelerated =
+            pdevice->info->a6xx.has_dp2acc;
+         props->integerDotProduct16BitUnsignedAccelerated = false;
+         props->integerDotProduct16BitSignedAccelerated = false;
+         props->integerDotProduct16BitMixedSignednessAccelerated = false;
+         props->integerDotProduct32BitUnsignedAccelerated = false;
+         props->integerDotProduct32BitSignedAccelerated = false;
+         props->integerDotProduct32BitMixedSignednessAccelerated = false;
+         props->integerDotProduct64BitUnsignedAccelerated = false;
+         props->integerDotProduct64BitSignedAccelerated = false;
+         props->integerDotProduct64BitMixedSignednessAccelerated = false;
+         props->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false;
+         props->integerDotProductAccumulatingSaturating8BitSignedAccelerated = false;
+         props->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false;
+         props->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated =
+            pdevice->info->a6xx.has_dp2acc;
+         /* TODO: we should be able to emulate Saturating4x8BitPackedSigned fast enough */
+         props->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = false;
+         props->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated =
+            pdevice->info->a6xx.has_dp2acc;
+         props->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false;
+         props->integerDotProductAccumulatingSaturating16BitSignedAccelerated = false;
+         props->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false;
+         props->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false;
+         props->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false;
+         props->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false;
+         props->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false;
+         props->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false;
+         props->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false;
+         break;
+      }
 
       default:
          break;