From d77bfc117cbed559221d460f270ab21ea27d3032 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Fri, 26 Nov 2021 18:57:52 +0200 Subject: [PATCH] tu,ir3: Implement VK_KHR_shader_integer_dot_product - gen4 - has dp4acc and dp2acc, dp4acc is used to implement 4x8 dot product. - gen3 - has dp2acc, in OpenCL blob uses dp2acc for dot product on both get3 and gen4. - gen2 - unknown, lower everything. - gen1 - no dp2acc, lower everything. OpenCL blob doesn't advertise cl_qcom_dot_product8 but still generates code for it. The assembly is more verbose and uses yet to be documented mad32.u16 instruction. Passes: dEQP-VK.spirv_assembly.instruction.compute.opsdotkhr.* dEQP-VK.spirv_assembly.instruction.compute.opudotkhr.* dEQP-VK.spirv_assembly.instruction.compute.opsudotkhr.* dEQP-VK.spirv_assembly.instruction.compute.opsdotaccsatkhr.* dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.* dEQP-VK.spirv_assembly.instruction.compute.opsudotaccsatkhr.* Only packed 4x8 unsigned and mixed versions are accelerated. However in theory we should be able to do better for signed version than current NIR lowering. Signed-off-by: Danylo Piliaiev Part-of: --- docs/features.txt | 2 +- src/freedreno/common/freedreno_dev_info.h | 3 + src/freedreno/common/freedreno_devices.py | 4 + src/freedreno/ir3/ir3_compiler.c | 21 +++-- src/freedreno/ir3/ir3_compiler.h | 3 + src/freedreno/ir3/ir3_compiler_nir.c | 95 +++++++++++++++++++++++ src/freedreno/vulkan/tu_device.c | 49 ++++++++++++ 7 files changed, 168 insertions(+), 9 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index ec15bbbb62b..2718d20d9f7 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -493,7 +493,7 @@ Khronos extensions that are not part of any Vulkan version: VK_KHR_pipeline_executable_properties DONE (anv, radv, tu) VK_KHR_push_descriptor DONE (anv, lvp, radv, tu) VK_KHR_shader_clock DONE (anv, radv) - VK_KHR_shader_integer_dot_product DONE (radv) + VK_KHR_shader_integer_dot_product DONE (anv, radv, tu) VK_KHR_shader_non_semantic_info DONE (anv, radv) VK_KHR_shader_subgroup_uniform_control_flow DONE (anv, radv) VK_KHR_shader_terminate_invocation DONE (anv, radv, tu) diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h index 14149325637..0a1ab85ef9a 100644 --- a/src/freedreno/common/freedreno_dev_info.h +++ b/src/freedreno/common/freedreno_dev_info.h @@ -130,6 +130,9 @@ struct fd_dev_info { bool has_getfiberid; + bool has_dp2acc; + bool has_dp4acc; + struct { uint32_t RB_UNKNOWN_8E04_blit; uint32_t PC_POWER_CNTL; diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index 5dac42c518c..4bcdec0e237 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -224,6 +224,7 @@ a6xx_gen2 = dict( has_z24uint_s8uint = True, indirect_draw_wfm_quirk = True, depth_bounds_require_depth_test_quirk = True, # TODO: check if true + has_dp2acc = False, # TODO: check if true magic = dict( TPL1_DBG_ECO_CNTL = 0, ), @@ -243,6 +244,7 @@ a6xx_gen3 = dict( has_sample_locations = True, has_ccu_flush_bug = True, has_8bpp_ubwc = False, + has_dp2acc = True, magic = dict( # this seems to be a chicken bit that fixes cubic filtering: TPL1_DBG_ECO_CNTL = 0x1000000, @@ -266,6 +268,8 @@ a6xx_gen4 = dict( has_lpac = True, has_shading_rate = True, has_getfiberid = True, + has_dp2acc = True, + has_dp4acc = True, magic = dict( TPL1_DBG_ECO_CNTL = 0x5008000, ), diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 8d98014632a..077320a26b9 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -182,6 +182,8 @@ static const nir_shader_compiler_options options_a6xx = { .lower_uniforms_to_ubo = true, .lower_device_index_to_zero = true, .use_scoped_barrier = true, + .has_udot_4x8 = true, + .has_sudot_4x8 = true, }; struct ir3_compiler * @@ -212,6 +214,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->max_variable_workgroup_size = 1024; + const struct fd_dev_info *dev_info = fd_dev_info(compiler->dev_id); + if (compiler->gen >= 6) { compiler->samgq_workaround = true; /* a6xx split the pipeline state into geometry and fragment state, in @@ -241,14 +245,14 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, /* TODO: implement private memory on earlier gen's */ compiler->has_pvtmem = true; - compiler->tess_use_shared = - fd_dev_info(compiler->dev_id)->a6xx.tess_use_shared; + compiler->tess_use_shared = dev_info->a6xx.tess_use_shared; - compiler->storage_16bit = - fd_dev_info(compiler->dev_id)->a6xx.storage_16bit; + compiler->storage_16bit = dev_info->a6xx.storage_16bit; - compiler->has_getfiberid = - fd_dev_info(compiler->dev_id)->a6xx.has_getfiberid; + compiler->has_getfiberid = dev_info->a6xx.has_getfiberid; + + compiler->has_dp2acc = dev_info->a6xx.has_dp2acc; + compiler->has_dp4acc = dev_info->a6xx.has_dp4acc; } else { compiler->max_const_pipeline = 512; compiler->max_const_geom = 512; @@ -262,8 +266,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, } if (compiler->gen >= 6) { - compiler->reg_size_vec4 = - fd_dev_info(compiler->dev_id)->a6xx.reg_size_vec4; + compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4; } else if (compiler->gen >= 4) { /* On a4xx-a5xx, using r24.x and above requires using the smallest * threadsize. @@ -309,6 +312,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, if (compiler->gen >= 6) { compiler->nir_options = options_a6xx; + compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc; + compiler->nir_options.has_sudot_4x8 = dev_info->a6xx.has_dp2acc; } else { compiler->nir_options = options; } diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index d8bfe1c0925..1767c646ac6 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -172,6 +172,9 @@ struct ir3_compiler { /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */ uint32_t max_variable_workgroup_size; + bool has_dp2acc; + bool has_dp4acc; + /* Type to use for 1b nir bools: */ type_t bool_type; }; diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index bbb58abafb7..17449664e40 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -289,6 +289,76 @@ resize_shift_amount(struct ir3_context *ctx, struct ir3_instruction *src, return ir3_COV(ctx->block, src, TYPE_U32, TYPE_U16); } +static void +emit_alu_dot_4x8_as_dp4acc(struct ir3_context *ctx, nir_alu_instr *alu, + struct ir3_instruction **dst, + struct ir3_instruction **src) +{ + struct ir3_instruction *accumulator = NULL; + if (alu->op == nir_op_udot_4x8_uadd_sat) { + accumulator = create_immed(ctx->block, 0); + } else { + accumulator = src[2]; + } + + dst[0] = ir3_DP4ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0); + + if (alu->op == nir_op_udot_4x8_uadd || + alu->op == nir_op_udot_4x8_uadd_sat) { + dst[0]->cat3.signedness = IR3_SRC_UNSIGNED; + } else { + dst[0]->cat3.signedness = IR3_SRC_MIXED; + } + + /* For some reason (sat) doesn't work in unsigned case so + * we have to emulate it. + */ + if (alu->op == nir_op_udot_4x8_uadd_sat) { + dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0); + dst[0]->flags |= IR3_INSTR_SAT; + } else if (alu->op == nir_op_sudot_4x8_iadd_sat) { + dst[0]->flags |= IR3_INSTR_SAT; + } +} + +static void +emit_alu_dot_4x8_as_dp2acc(struct ir3_context *ctx, nir_alu_instr *alu, + struct ir3_instruction **dst, + struct ir3_instruction **src) +{ + int signedness; + if (alu->op == nir_op_udot_4x8_uadd || + alu->op == nir_op_udot_4x8_uadd_sat) { + signedness = IR3_SRC_UNSIGNED; + } else { + signedness = IR3_SRC_MIXED; + } + + struct ir3_instruction *accumulator = NULL; + if (alu->op == nir_op_udot_4x8_uadd_sat || + alu->op == nir_op_sudot_4x8_iadd_sat) { + accumulator = create_immed(ctx->block, 0); + } else { + accumulator = src[2]; + } + + dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0); + dst[0]->cat3.packed = IR3_SRC_PACKED_LOW; + dst[0]->cat3.signedness = signedness; + + dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, dst[0], 0); + dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH; + dst[0]->cat3.signedness = signedness; + + if (alu->op == nir_op_udot_4x8_uadd_sat) { + dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0); + dst[0]->flags |= IR3_INSTR_SAT; + } else if (alu->op == nir_op_sudot_4x8_iadd_sat) { + dst[0] = ir3_ADD_S(ctx->block, dst[0], 0, src[2], 0); + dst[0]->flags |= IR3_INSTR_SAT; + } +} + static void emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) { @@ -744,6 +814,31 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) dst[0] = ir3_BFREV_B(b, src[0], 0); break; + case nir_op_uadd_sat: + dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0); + dst[0]->flags |= IR3_INSTR_SAT; + break; + case nir_op_iadd_sat: + dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0); + dst[0]->flags |= IR3_INSTR_SAT; + break; + + case nir_op_udot_4x8_uadd: + case nir_op_udot_4x8_uadd_sat: + case nir_op_sudot_4x8_iadd: + case nir_op_sudot_4x8_iadd_sat: { + if (ctx->compiler->has_dp4acc) { + emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst, src); + } else if (ctx->compiler->has_dp2acc) { + emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst, src); + } else { + ir3_context_error(ctx, "ALU op should have been lowered: %s\n", + nir_op_infos[alu->op].name); + } + + break; + } + default: ir3_context_error(ctx, "Unhandled ALU op: %s\n", nir_op_infos[alu->op].name); diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index 417f24ff3c5..908885eacdb 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -157,6 +157,7 @@ get_device_extensions(const struct tu_physical_device *device, .KHR_driver_properties = true, .KHR_separate_depth_stencil_layouts = true, .KHR_buffer_device_address = true, + .KHR_shader_integer_dot_product = true, #ifndef TU_USE_KGSL .KHR_timeline_semaphore = true, #endif @@ -790,6 +791,12 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, features->computeFullSubgroups = true; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR: { + VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR *features = + (VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR *)ext; + features->shaderIntegerDotProduct = true; + break; + }; default: break; @@ -1159,6 +1166,48 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, props->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR: { + VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR *props = + (VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR *)ext; + + props->integerDotProduct8BitUnsignedAccelerated = false; + props->integerDotProduct8BitSignedAccelerated = false; + props->integerDotProduct8BitMixedSignednessAccelerated = false; + props->integerDotProduct4x8BitPackedUnsignedAccelerated = + pdevice->info->a6xx.has_dp2acc; + /* TODO: we should be able to emulate 4x8BitPackedSigned fast enough */ + props->integerDotProduct4x8BitPackedSignedAccelerated = false; + props->integerDotProduct4x8BitPackedMixedSignednessAccelerated = + pdevice->info->a6xx.has_dp2acc; + props->integerDotProduct16BitUnsignedAccelerated = false; + props->integerDotProduct16BitSignedAccelerated = false; + props->integerDotProduct16BitMixedSignednessAccelerated = false; + props->integerDotProduct32BitUnsignedAccelerated = false; + props->integerDotProduct32BitSignedAccelerated = false; + props->integerDotProduct32BitMixedSignednessAccelerated = false; + props->integerDotProduct64BitUnsignedAccelerated = false; + props->integerDotProduct64BitSignedAccelerated = false; + props->integerDotProduct64BitMixedSignednessAccelerated = false; + props->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false; + props->integerDotProductAccumulatingSaturating8BitSignedAccelerated = false; + props->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false; + props->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = + pdevice->info->a6xx.has_dp2acc; + /* TODO: we should be able to emulate Saturating4x8BitPackedSigned fast enough */ + props->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = false; + props->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = + pdevice->info->a6xx.has_dp2acc; + props->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false; + props->integerDotProductAccumulatingSaturating16BitSignedAccelerated = false; + props->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false; + props->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false; + props->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false; + props->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false; + props->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false; + props->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false; + props->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false; + break; + } default: break;