From 201851a3d12db81ebcf7ae085da15a943a947324 Mon Sep 17 00:00:00 2001
From: Connor Abbott
Date: Tue, 26 Jul 2022 12:25:30 +0200
Subject: [PATCH] tu: Initial implementation of VK_EXT_inline_uniform_block

This is a trivial implementation where we just insert a UBO descriptor
pointing to the actual data and then treat it as a normal UBO everywhere
else.

In theory an indirect CP_LOAD_STATE would be more efficient than ldc.k
to preload inline uniform blocks to constants. However, we will always
need the UBO descriptor anyway, even if we lower the limits enough to
always be able to preload them, because with variable pointers we may
have a pointer that could point to either an inline uniform block or a
regular uniform block. So using an indirect CP_LOAD_STATE should be an
optimization on top of this.

Part-of:
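To make the layout concrete, here is a minimal standalone C sketch of the
scheme described above, for illustration only: each inline uniform block
binding holds a UBO descriptor immediately followed by the block's data,
and the descriptor simply points at that data. A6XX_TEX_CONST_DWORDS is
hardcoded to the a6xx value of 16, and the two helpers are simplified
stand-ins for the generated A6XX_UBO_0_BASE_LO/A6XX_UBO_1_BASE_HI/
A6XX_UBO_1_SIZE macros; the SIZE shift in particular is assumed rather
than taken from the real register layout.

    #include <stdint.h>

    #define A6XX_TEX_CONST_DWORDS 16
    #define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

    /* One binding = UBO descriptor + data, padded to the descriptor
     * alignment (mirrors the new VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK
     * case in descriptor_size() below). */
    static uint32_t
    inline_block_binding_size(uint32_t data_bytes)
    {
       return A6XX_TEX_CONST_DWORDS * 4 +
              ALIGN(data_bytes, A6XX_TEX_CONST_DWORDS * 4);
    }

    /* The data lives right after the descriptor, so the UBO base address
     * is the binding address plus the descriptor size, and the UBO size is
     * expressed in vec4 (16-byte) units, as in tu_descriptor_set_create()
     * below. */
    static void
    write_inline_block_descriptor(uint32_t *ptr, uint64_t binding_va,
                                  uint32_t data_bytes)
    {
       uint64_t va = binding_va + A6XX_TEX_CONST_DWORDS * 4;
       uint32_t size_vec4 = ALIGN(data_bytes, 16) / 16;

       ptr[0] = (uint32_t) va;             /* stand-in for A6XX_UBO_0_BASE_LO() */
       ptr[1] = (uint32_t) (va >> 32) |    /* stand-in for A6XX_UBO_1_BASE_HI() */
                (size_vec4 << 16);         /* stand-in for A6XX_UBO_1_SIZE(); shift assumed */
    }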
---
 docs/features.txt                        |   2 +-
 src/freedreno/vulkan/tu_descriptor_set.c | 281 +++++++++++++++++++----
 src/freedreno/vulkan/tu_descriptor_set.h |   1 +
 src/freedreno/vulkan/tu_device.c         |  25 +-
 src/freedreno/vulkan/tu_pipeline.c       |   4 +-
 src/freedreno/vulkan/tu_shader.c         |  14 +-
 6 files changed, 273 insertions(+), 54 deletions(-)

diff --git a/docs/features.txt b/docs/features.txt
index aebca5368c7..97a435d0ead 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -485,7 +485,7 @@ Vulkan 1.3 -- all DONE: anv, radv, lvp
   VK_EXT_4444_formats                                   DONE (anv, lvp, radv, tu, v3dv, vn)
   VK_EXT_extended_dynamic_state                         DONE (anv, lvp, radv, tu, vn)
   VK_EXT_extended_dynamic_state2                        DONE (anv, lvp, radv, tu, vn)
-  VK_EXT_inline_uniform_block                           DONE (anv, lvp, radv, v3dv, vn)
+  VK_EXT_inline_uniform_block                           DONE (anv, lvp, radv, tu, v3dv, vn)
   VK_EXT_pipeline_creation_cache_control                DONE (anv, lvp, radv, tu, v3dv, vn)
   VK_EXT_pipeline_creation_feedback                     DONE (anv, lvp, radv, tu, v3dv, vn)
   VK_EXT_private_data                                   DONE (anv, lvp, pvr, radv, tu, v3dv, vn)
diff --git a/src/freedreno/vulkan/tu_descriptor_set.c b/src/freedreno/vulkan/tu_descriptor_set.c
index 6b69f1ab064..8751e1aa0c7 100644
--- a/src/freedreno/vulkan/tu_descriptor_set.c
+++ b/src/freedreno/vulkan/tu_descriptor_set.c
@@ -35,7 +35,9 @@ pool_base(struct tu_descriptor_pool *pool)
 }
 
 static uint32_t
-descriptor_size(struct tu_device *dev, VkDescriptorType type)
+descriptor_size(struct tu_device *dev,
+                const VkDescriptorSetLayoutBinding *binding,
+                VkDescriptorType type)
 {
    switch (type) {
    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
@@ -63,6 +65,9 @@ descriptor_size(struct tu_device *dev, VkDescriptorType type)
       } else {
          return A6XX_TEX_CONST_DWORDS * 4;
       }
+   case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
+      return A6XX_TEX_CONST_DWORDS * 4 +
+         ALIGN(binding->descriptorCount, A6XX_TEX_CONST_DWORDS * 4);
    default:
       return A6XX_TEX_CONST_DWORDS * 4;
    }
@@ -76,12 +81,13 @@ is_dynamic(VkDescriptorType type)
 }
 
 static uint32_t
-mutable_descriptor_size(struct tu_device *dev, const VkMutableDescriptorTypeListVALVE *list)
+mutable_descriptor_size(struct tu_device *dev,
+                        const VkMutableDescriptorTypeListVALVE *list)
 {
    uint32_t max_size = 0;
 
    for (uint32_t i = 0; i < list->descriptorTypeCount; i++) {
-      uint32_t size = descriptor_size(dev, list->pDescriptorTypes[i]);
+      uint32_t size = descriptor_size(dev, NULL, list->pDescriptorTypes[i]);
       max_size = MAX2(max_size, size);
    }
 
@@ -163,6 +169,7 @@ tu_CreateDescriptorSetLayout(
    set_layout->binding_count = num_bindings;
    set_layout->shader_stages = 0;
    set_layout->has_immutable_samplers = false;
+   set_layout->has_inline_uniforms = false;
    set_layout->size = 0;
 
    uint32_t dynamic_offset_size = 0;
@@ -172,7 +179,9 @@
      uint32_t b = binding->binding;
 
      set_layout->binding[b].type = binding->descriptorType;
-     set_layout->binding[b].array_size = binding->descriptorCount;
+     set_layout->binding[b].array_size =
+        binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ?
+        1 : binding->descriptorCount;
      set_layout->binding[b].offset = set_layout->size;
      set_layout->binding[b].dynamic_offset_offset = dynamic_offset_size;
      set_layout->binding[b].shader_stages = binding->stageFlags;
@@ -184,9 +193,13 @@
         set_layout->binding[b].size =
            mutable_descriptor_size(device, &mutable_info->pMutableDescriptorTypeLists[j]);
      } else {
-        set_layout->binding[b].size = descriptor_size(device, binding->descriptorType);
+        set_layout->binding[b].size =
+           descriptor_size(device, binding, binding->descriptorType);
      }
 
+     if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
+        set_layout->has_inline_uniforms = true;
+
      if (variable_flags && binding->binding < variable_flags->bindingCount &&
          (variable_flags->pBindingFlags[binding->binding] &
           VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) {
@@ -231,12 +244,12 @@
         }
      }
 
+     uint32_t size =
+        ALIGN_POT(set_layout->binding[b].array_size * set_layout->binding[b].size, 4 * A6XX_TEX_CONST_DWORDS);
      if (is_dynamic(binding->descriptorType)) {
-        dynamic_offset_size +=
-           binding->descriptorCount * set_layout->binding[b].size;
+        dynamic_offset_size += size;
      } else {
-        set_layout->size +=
-           binding->descriptorCount * set_layout->binding[b].size;
+        set_layout->size += size;
      }
 
      set_layout->shader_stages |= binding->stageFlags;
@@ -323,7 +336,8 @@ tu_GetDescriptorSetLayoutSupport(
            /* Don't support the input attachement and combined image sampler type
             * for mutable descriptors */
            if (list->pDescriptorTypes[j] == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT ||
-               list->pDescriptorTypes[j] == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
+               list->pDescriptorTypes[j] == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
+               list->pDescriptorTypes[j] == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
               supported = false;
               goto out;
            }
@@ -332,7 +346,7 @@
         descriptor_sz =
            mutable_descriptor_size(device, &mutable_info->pMutableDescriptorTypeLists[i]);
      } else {
-        descriptor_sz = descriptor_size(device, binding->descriptorType);
+        descriptor_sz = descriptor_size(device, binding, binding->descriptorType);
      }
      uint64_t descriptor_alignment = 4 * A6XX_TEX_CONST_DWORDS;
@@ -342,10 +356,16 @@
      size = ALIGN_POT(size, descriptor_alignment);
 
      uint64_t max_count = MAX_SET_SIZE;
-     if (descriptor_sz)
+     unsigned descriptor_count = binding->descriptorCount;
+     if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+        max_count = MAX_SET_SIZE - size;
+        descriptor_count = descriptor_sz;
+        descriptor_sz = 1;
+     } else if (descriptor_sz) {
         max_count = (MAX_SET_SIZE - size) / descriptor_sz;
+     }
 
-     if (max_count < binding->descriptorCount) {
+     if (max_count < descriptor_count) {
        supported = false;
      }
@@ -356,7 +376,7 @@
        variable_count->maxVariableDescriptorCount =
           MIN2(UINT32_MAX, max_count);
      }
 
-     size += binding->descriptorCount * descriptor_sz;
+     size += descriptor_count * descriptor_sz;
   }
 
out:
@@ -532,9 +552,15 @@ tu_descriptor_set_create(struct tu_device *device,
   uint32_t layout_size = layout->size;
   if (variable_count) {
      assert(layout->has_variable_descriptors);
-     uint32_t stride = layout->binding[layout->binding_count - 1].size;
-     layout_size = layout->binding[layout->binding_count - 1].offset +
-                   *variable_count * stride;
+     struct tu_descriptor_set_binding_layout *binding =
+        &layout->binding[layout->binding_count - 1];
+     if (binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+        layout_size = binding->offset + A6XX_TEX_CONST_DWORDS * 4 +
+           ALIGN(*variable_count, A6XX_TEX_CONST_DWORDS * 4);
+     } else {
+        uint32_t stride = binding->size;
+        layout_size = binding->offset + *variable_count * stride;
+     }
   }
 
   if (layout_size) {
@@ -607,6 +633,24 @@ tu_descriptor_set_create(struct tu_device *device,
      }
   }
 
+  if (layout->has_inline_uniforms) {
+     for (unsigned i = 0; i < layout->binding_count; i++) {
+        if (layout->binding[i].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
+           continue;
+
+        uint32_t *ptr = set->mapped_ptr + layout->binding[i].offset / 4;
+        uint64_t va = set->va + layout->binding[i].offset +
+           A6XX_TEX_CONST_DWORDS * 4;
+        uint32_t size =
+           (layout->has_variable_descriptors && i == layout->binding_count - 1) ?
+           *variable_count : layout->binding[i].size - A6XX_TEX_CONST_DWORDS * 4;
+        size = ALIGN_POT(size, 16) / 16;
+
+        ptr[0] = A6XX_UBO_0_BASE_LO(va);
+        ptr[1] = A6XX_UBO_1_BASE_HI(va >> 32) | A6XX_UBO_1_SIZE(size);
+     }
+  }
+
   tu_descriptor_set_layout_ref(layout);
   list_addtail(&set->pool_link, &pool->desc_sets);
@@ -654,10 +698,29 @@ tu_CreateDescriptorPool(VkDevice _device,
      vk_find_struct_const( pCreateInfo->pNext,
                            MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_VALVE);
 
+  const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info =
+     vk_find_struct_const(pCreateInfo->pNext,
+                          DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO);
+
+  if (inline_info) {
+     /* In addition to the size of the descriptors, we have to factor in the
+      * padding for each binding. The sizes are 4-byte aligned but we have to
+      * align to the descriptor size, and in the worst case each inline
+      * binding has a size of 4 bytes and we have to pad each one out.
+      */
+     bo_size += (2 * 4 * A6XX_TEX_CONST_DWORDS - 4) *
+        inline_info->maxInlineUniformBlockBindings;
+  }
+
   for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) {
      const VkDescriptorPoolSize *pool_size = &pCreateInfo->pPoolSizes[i];
 
      switch (pool_size->type) {
+     case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+     case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+        dynamic_size += descriptor_size(device, NULL, pool_size->type) *
+           pool_size->descriptorCount;
+        break;
      case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE:
         if (mutable_info && i < mutable_info->mutableDescriptorTypeListCount &&
             mutable_info->pMutableDescriptorTypeLists[i].descriptorTypeCount > 0) {
@@ -669,17 +732,15 @@ tu_CreateDescriptorPool(VkDevice _device,
           bo_size += 2 * A6XX_TEX_CONST_DWORDS * 4 *
              pool_size->descriptorCount;
        }
-       continue;
+       break;
+     case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
+        bo_size += pool_size->descriptorCount;
+        break;
      default:
+        bo_size += descriptor_size(device, NULL, pool_size->type) *
+           pool_size->descriptorCount;
        break;
      }
-
-     const uint32_t desc_size = descriptor_size(device, pool_size->type) *
-                                pool_size->descriptorCount;
-     if (is_dynamic(pool_size->type))
-        dynamic_size += desc_size;
-     else
-        bo_size += desc_size;
   }
 
   if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) {
@@ -1024,7 +1085,47 @@ tu_update_descriptor_sets(const struct tu_device *device,
      const struct tu_sampler *samplers =
         tu_immutable_samplers(set->layout, binding_layout);
 
-     ptr += (binding_layout->size / 4) * writeset->dstArrayElement;
+     if (writeset->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+        /* We need to respect this note:
+         *
+         *    The same behavior applies to bindings with a descriptor type of
+         *    VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK where descriptorCount
+         *    specifies the number of bytes to update while dstArrayElement
+         *    specifies the starting byte offset, thus in this case if the
+         *    dstBinding has a smaller byte size than the sum of
+         *    dstArrayElement and descriptorCount, then the remainder will be
+         *    used to update the subsequent binding - dstBinding+1 starting
+         *    at offset zero. This falls out as a special case of the above
+         *    rule.
+         *
+         * This means we can't just do a straight memcpy, because due to
+         * alignment padding and the descriptor itself there are gaps between
+         * sequential bindings. We have to loop over each binding updated.
+         */
+        const VkWriteDescriptorSetInlineUniformBlock *inline_write =
+           vk_find_struct_const(writeset->pNext,
+                                WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK);
+        uint32_t remaining = inline_write->dataSize;
+        const uint8_t *src = inline_write->pData;
+        uint32_t dst_offset = writeset->dstArrayElement;
+        do {
+           uint8_t *dst = (uint8_t *)(ptr + A6XX_TEX_CONST_DWORDS) + dst_offset;
+           uint32_t binding_size =
+              binding_layout->size - A6XX_TEX_CONST_DWORDS * 4 - dst_offset;
+           uint32_t to_write = MIN2(remaining, binding_size);
+           memcpy(dst, src, to_write);
+
+           binding_layout++;
+           ptr = set->mapped_ptr + binding_layout->offset / 4;
+           dst_offset = 0;
+           src += to_write;
+           remaining -= to_write;
+        } while (remaining > 0);
+
+        continue;
+     }
+
+     ptr += binding_layout->size / 4 * writeset->dstArrayElement;
      for (j = 0; j < writeset->descriptorCount; ++j) {
        switch(writeset->descriptorType) {
        case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
@@ -1096,6 +1197,44 @@ tu_update_descriptor_sets(const struct tu_device *device,
        dst_ptr += dst_binding_layout->offset / 4;
      }
 
+     if (src_binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+        uint32_t remaining = copyset->descriptorCount;
+        uint32_t src_start = copyset->srcArrayElement;
+        uint32_t dst_start = copyset->dstArrayElement;
+        uint8_t *src = (uint8_t *)(src_ptr + A6XX_TEX_CONST_DWORDS) + src_start;
+        uint8_t *dst = (uint8_t *)(dst_ptr + A6XX_TEX_CONST_DWORDS) + dst_start;
+        uint32_t src_remaining =
+           src_binding_layout->size - src_start - 4 * A6XX_TEX_CONST_DWORDS;
+        uint32_t dst_remaining =
+           dst_binding_layout->size - dst_start - 4 * A6XX_TEX_CONST_DWORDS;
+        do {
+           uint32_t to_write = MIN3(remaining, src_remaining, dst_remaining);
+           memcpy(dst, src, to_write);
+
+           src += to_write;
+           dst += to_write;
+           src_remaining -= to_write;
+           dst_remaining -= to_write;
+           remaining -= to_write;
+
+           if (src_remaining == 0) {
+              src_binding_layout++;
+              src_ptr = src_set->mapped_ptr + src_binding_layout->offset / 4;
+              src = (uint8_t *)(src_ptr + A6XX_TEX_CONST_DWORDS);
+              src_remaining = src_binding_layout->size - 4 * A6XX_TEX_CONST_DWORDS;
+           }
+
+           if (dst_remaining == 0) {
+              dst_binding_layout++;
+              dst_ptr = dst_set->mapped_ptr + dst_binding_layout->offset / 4;
+              dst = (uint8_t *)(dst_ptr + A6XX_TEX_CONST_DWORDS);
+              dst_remaining = dst_binding_layout->size - 4 * A6XX_TEX_CONST_DWORDS;
+           }
+        } while (remaining > 0);
+
+        continue;
+     }
+
      src_ptr += src_binding_layout->size * copyset->srcArrayElement / 4;
      dst_ptr += dst_binding_layout->size * copyset->dstArrayElement / 4;
@@ -1136,17 +1275,7 @@ tu_CreateDescriptorUpdateTemplate(
   TU_FROM_HANDLE(tu_device, device, _device);
   struct tu_descriptor_set_layout *set_layout = NULL;
   const uint32_t entry_count = pCreateInfo->descriptorUpdateEntryCount;
-  const size_t size =
-     sizeof(struct tu_descriptor_update_template) +
-     sizeof(struct tu_descriptor_update_template_entry) * entry_count;
-  struct tu_descriptor_update_template *templ;
-
-  templ = vk_object_alloc(&device->vk, pAllocator, size,
-                          VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE);
-  if (!templ)
-     return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
-
-  templ->entry_count = entry_count;
+  uint32_t dst_entry_count = 0;
 
   if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR) {
      TU_FROM_HANDLE(tu_pipeline_layout, pipeline_layout, pCreateInfo->pipelineLayout);
@@ -1156,14 +1285,54 @@ tu_CreateDescriptorUpdateTemplate(
      */
     assert(pCreateInfo->set < MAX_SETS);
     set_layout = pipeline_layout->set[pCreateInfo->set].layout;
-
-    templ->bind_point = pCreateInfo->pipelineBindPoint;
   } else {
      TU_FROM_HANDLE(tu_descriptor_set_layout, _set_layout,
                     pCreateInfo->descriptorSetLayout);
      set_layout = _set_layout;
   }
 
+  for (uint32_t i = 0; i < entry_count; i++) {
+     const VkDescriptorUpdateTemplateEntry *entry = &pCreateInfo->pDescriptorUpdateEntries[i];
+     if (entry->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+        dst_entry_count++;
+        continue;
+     }
+
+     /* Calculate how many bindings this update steps over, so we can split
+      * up the template entry. This lets the actual update be a simple
+      * memcpy.
+      */
+     uint32_t remaining = entry->descriptorCount;
+     const struct tu_descriptor_set_binding_layout *binding_layout =
+        set_layout->binding + entry->dstBinding;
+     uint32_t dst_start = entry->dstArrayElement;
+     do {
+        uint32_t size = binding_layout->size - A6XX_TEX_CONST_DWORDS * 4;
+        uint32_t count = MIN2(remaining, size - dst_start);
+        remaining -= count;
+        binding_layout++;
+        dst_entry_count++;
+        dst_start = 0;
+     } while (remaining > 0);
+  }
+
+  const size_t size =
+     sizeof(struct tu_descriptor_update_template) +
+     sizeof(struct tu_descriptor_update_template_entry) * dst_entry_count;
+  struct tu_descriptor_update_template *templ;
+
+  templ = vk_object_alloc(&device->vk, pAllocator, size,
+                          VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE);
+  if (!templ)
+     return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+  templ->entry_count = dst_entry_count;
+
+  if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR) {
+     templ->bind_point = pCreateInfo->pipelineBindPoint;
+  }
+
+  uint32_t j = 0;
   for (uint32_t i = 0; i < entry_count; i++) {
      const VkDescriptorUpdateTemplateEntry *entry = &pCreateInfo->pDescriptorUpdateEntries[i];
@@ -1180,6 +1349,30 @@
      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
        dst_offset = binding_layout->dynamic_offset_offset / 4;
        break;
+     case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
+        uint32_t remaining = entry->descriptorCount;
+        uint32_t dst_start = entry->dstArrayElement;
+        uint32_t src_offset = entry->offset;
+        /* See comment in update_descriptor_sets() */
+        do {
+           dst_offset =
+              binding_layout->offset + A6XX_TEX_CONST_DWORDS * 4 + dst_start;
+           uint32_t size = binding_layout->size - A6XX_TEX_CONST_DWORDS * 4;
+           uint32_t count = MIN2(remaining, size - dst_start);
+           templ->entry[j++] = (struct tu_descriptor_update_template_entry) {
+              .descriptor_type = entry->descriptorType,
+              .descriptor_count = count,
+              .src_offset = src_offset,
+              .dst_offset = dst_offset,
+           };
+           remaining -= count;
+           src_offset += count;
+           binding_layout++;
+           dst_start = 0;
+        } while (remaining > 0);
+
+        continue;
+     }
      case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
      case VK_DESCRIPTOR_TYPE_SAMPLER:
        if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR &&
@@ -1195,7 +1388,7 @@
        dst_offset += (binding_layout->size * entry->dstArrayElement) / 4;
        dst_stride = binding_layout->size / 4;
 
-     templ->entry[i] = (struct tu_descriptor_update_template_entry) {
+     templ->entry[j++] = (struct tu_descriptor_update_template_entry) {
        .descriptor_type = entry->descriptorType,
        .descriptor_count = entry->descriptorCount,
        .src_offset = entry->offset,
@@ -1207,6 +1400,8 @@
      };
   }
 
+  assert(j == dst_entry_count);
+
   *pDescriptorUpdateTemplate =
      tu_descriptor_update_template_to_handle(templ);
@@ -1244,6 +1439,12 @@ tu_update_descriptor_set_with_template(
      const void *src = ((const char *) pData) + templ->entry[i].src_offset;
      const struct tu_sampler *samplers = templ->entry[i].immutable_samplers;
 
+     if (templ->entry[i].descriptor_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+        memcpy(((uint8_t *) ptr) + templ->entry[i].dst_offset, src,
+               templ->entry[i].descriptor_count);
+        continue;
+     }
+
      ptr += templ->entry[i].dst_offset;
      unsigned dst_offset = templ->entry[i].dst_offset;
      for (unsigned j = 0; j < templ->entry[i].descriptor_count; ++j) {
diff --git a/src/freedreno/vulkan/tu_descriptor_set.h b/src/freedreno/vulkan/tu_descriptor_set.h
index 115f11355d3..b3d54006ae4 100644
--- a/src/freedreno/vulkan/tu_descriptor_set.h
+++ b/src/freedreno/vulkan/tu_descriptor_set.h
@@ -74,6 +74,7 @@ struct tu_descriptor_set_layout
 
   bool has_immutable_samplers;
   bool has_variable_descriptors;
+  bool has_inline_uniforms;
 
   /* Bindings in this descriptor set */
   struct tu_descriptor_set_binding_layout binding[0];
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index 1a67ac9466d..ef4ddbd66fd 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -234,6 +234,7 @@ get_device_extensions(const struct tu_physical_device *device,
      .EXT_load_store_op_none = true,
      .EXT_non_seamless_cube_map = true,
      .EXT_tooling_info = true,
+     .EXT_inline_uniform_block = true,
   };
 }
 
@@ -590,8 +591,8 @@ tu_get_physical_device_features_1_3(struct tu_physical_device *pdevice,
                                     VkPhysicalDeviceVulkan13Features *features)
 {
   features->robustImageAccess = true;
-  features->inlineUniformBlock = false;
-  features->descriptorBindingInlineUniformBlockUpdateAfterBind = false;
+  features->inlineUniformBlock = true;
+  features->descriptorBindingInlineUniformBlockUpdateAfterBind = true;
   features->pipelineCreationCacheControl = true;
   features->privateData = true;
   features->shaderDemoteToHelperInvocation = true;
@@ -1039,13 +1040,19 @@ tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice,
   p->maxComputeWorkgroupSubgroups = 16; /* max_waves */
   p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
 
-  /* VK_EXT_inline_uniform_block is not implemented */
-  p->maxInlineUniformBlockSize = 0;
-  p->maxPerStageDescriptorInlineUniformBlocks = 0;
-  p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = 0;
-  p->maxDescriptorSetInlineUniformBlocks = 0;
-  p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = 0;
-  p->maxInlineUniformTotalSize = 0;
+  /* Inline uniform blocks are just normal UBOs */
+  p->maxInlineUniformBlockSize = MAX_UNIFORM_BUFFER_RANGE;
+
+  /* Halve the normal limit on the number of descriptors, see below. */
+  p->maxPerStageDescriptorInlineUniformBlocks = max_descriptor_set_size / 2;
+  p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = max_descriptor_set_size / 2;
+  p->maxDescriptorSetInlineUniformBlocks = max_descriptor_set_size / 2;
+  p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = max_descriptor_set_size / 2;
+  /* Because we halve the normal limit on the number of descriptors, in the
+   * worst case each descriptor takes up half the space, leaving the rest for
+   * the actual data.
+   */
+  p->maxInlineUniformTotalSize = MAX_SET_SIZE / 2;
 
   p->integerDotProduct8BitUnsignedAccelerated = false;
   p->integerDotProduct8BitSignedAccelerated = false;
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index d2391d19fef..27ecf89b720 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -86,6 +86,7 @@ tu6_load_state_size(struct tu_pipeline *pipeline,
      case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+     case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
        /* Textures and UBO's needs a packet for each stage */
        count = stage_count;
        break;
@@ -202,7 +203,8 @@ tu6_emit_load_state(struct tu_pipeline *pipeline,
           offset = (layout->set[i].dynamic_offset_start +
                     binding->dynamic_offset_offset) / 4;
           FALLTHROUGH;
-        case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
+        case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+        case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: {
           tu_foreach_stage(stage, stages) {
              emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
                              tu6_stage2shadersb(stage), base, offset, count);
diff --git a/src/freedreno/vulkan/tu_shader.c b/src/freedreno/vulkan/tu_shader.c
index c932cc7a39d..a32cec696fa 100644
--- a/src/freedreno/vulkan/tu_shader.c
+++ b/src/freedreno/vulkan/tu_shader.c
@@ -196,9 +196,17 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr,
      break;
   }
 
-  unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
-  assert(util_is_power_of_two_nonzero(stride));
-  nir_ssa_def *shift = nir_imm_int(b, util_logbase2(stride));
+  nir_ssa_def *shift;
+
+  if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+     /* Inline uniform blocks cannot have arrays so the stride is unused */
+     shift = nir_imm_int(b, 0);
+  } else {
+     unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
+     assert(util_is_power_of_two_nonzero(stride));
+     shift = nir_imm_int(b, util_logbase2(stride));
+  }
+
   nir_ssa_def *def = nir_vec3(b, nir_imm_int(b, set),
                               nir_iadd(b, nir_imm_int(b, base),
                                        nir_ishl(b, vulkan_idx, shift)),
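
As a reading aid for the update path, the binding-crossing copy rule from
tu_update_descriptor_sets() above can be pulled out into a standalone
sketch. The struct here is a simplified stand-in for
tu_descriptor_set_binding_layout, keeping only the two fields (in bytes)
that the loop touches, and A6XX_TEX_CONST_DWORDS is again assumed to be
the a6xx value of 16; the loop body otherwise mirrors the patch.

    #include <stdint.h>
    #include <string.h>

    #define A6XX_TEX_CONST_DWORDS 16
    #define MIN2(a, b) ((a) < (b) ? (a) : (b))

    /* Simplified stand-in for tu_descriptor_set_binding_layout. */
    struct binding_layout {
       uint32_t offset; /* byte offset of the binding within the set */
       uint32_t size;   /* UBO descriptor + inline data, padded */
    };

    static void
    write_inline_uniform_data(uint8_t *set_mem,
                              const struct binding_layout *binding_layout,
                              uint32_t dst_offset,  /* writeset->dstArrayElement */
                              const uint8_t *src,
                              uint32_t remaining)   /* inline_write->dataSize */
    {
       do {
          /* The writable data region of each binding starts after its
           * A6XX_TEX_CONST_DWORDS-dword UBO descriptor. */
          uint8_t *dst = set_mem + binding_layout->offset +
                         A6XX_TEX_CONST_DWORDS * 4 + dst_offset;
          uint32_t binding_size =
             binding_layout->size - A6XX_TEX_CONST_DWORDS * 4 - dst_offset;
          uint32_t to_write = MIN2(remaining, binding_size);

          memcpy(dst, src, to_write);

          /* Leftover bytes continue in the next binding at offset zero,
           * which is what the spec note quoted in tu_update_descriptor_sets()
           * requires. */
          binding_layout++;
          dst_offset = 0;
          src += to_write;
          remaining -= to_write;
       } while (remaining > 0);
    }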