tu: support KHR_8bit_storage
Add basic KHR_8bit_storage support for Adreno 750 devices, for now
enabling only the storageBuffer8BitAccess feature.

A separate descriptor is provided for 8-bit storage access, and the
descriptor index is adjusted appropriately for 8-bit SSBO loads and
stores.

8-bit SSBO loads cannot go through isam since that instruction cannot
handle them. The ldib and stib instruction encodings are a bit peculiar,
but they match the blob's image buffer access through VK_FORMAT_R8 and
the dedicated descriptor. These loads and stores do not work in
vectorized form, so they have to be scalarized. Additionally, stores of
8-bit values have to clear the higher bits of those values, since 8-bit
truncation can leave the higher bits undefined.

Zero-extension of 8-bit values has to use masking since the
corresponding cov instruction doesn't function as intended. 8-bit sign
extension through cov from a non-shared to a shared register also
doesn't work, so an exception is applied to avoid it. Conversion of
8-bit values to and from floating-point values likewise doesn't work
with a straightforward cov instruction; instead the conversion has to
go through a 16-bit value.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9979
Signed-off-by: Zan Dobersek <zdobersek@igalia.com>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28254>
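For context on what the new feature bit unlocks, here is a minimal sketch of how an application would detect this support, using only core Vulkan 1.2 structures (standard API, not part of this patch):

#include <vulkan/vulkan.h>

/* Query whether the device (e.g. an Adreno 750 with this patch) exposes
 * storageBuffer8BitAccess before using 8-bit SSBO types in shaders. */
static VkBool32
has_8bit_ssbo_access(VkPhysicalDevice pdev)
{
   VkPhysicalDevice8BitStorageFeatures f8 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES,
   };
   VkPhysicalDeviceFeatures2 features = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &f8,
   };
   vkGetPhysicalDeviceFeatures2(pdev, &features);
   /* Only storageBuffer8BitAccess is enabled by this patch; the uniform
    * and push-constant variants remain false. */
   return f8.storageBuffer8BitAccess;
}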
@@ -448,7 +448,7 @@ Vulkan 1.1 -- all DONE: anv, lvp, nvk, radv, tu, vn

 Vulkan 1.2 -- all DONE: anv, nvk, tu, vn

-  VK_KHR_8bit_storage                                  DONE (anv, dzn, hasvk, lvp, nvk, radv, v3dv, vn)
+  VK_KHR_8bit_storage                                  DONE (anv, dzn, hasvk, lvp, nvk, radv, tu/a750+, v3dv, vn)
   VK_KHR_buffer_device_address                         DONE (anv, hasvk, lvp, nvk, panvk, radv, tu, v3dv, vn)
   VK_KHR_create_renderpass2                            DONE (anv, dzn, hasvk, lvp, nvk, radv, tu, v3dv, vn)
   VK_KHR_depth_stencil_resolve                         DONE (anv, dzn, hasvk, lvp, nvk, radv, tu, v3dv, vn)
@@ -271,6 +271,8 @@ struct fd_dev_info {
        * best thing we could do is a toggle.
        */
       bool enable_tp_ubwc_flag_hint;
+
+      bool storage_8bit;
    } a7xx;
 };

@@ -856,6 +856,7 @@ a7xx_750 = A7XXProps(
     # now.
     #supports_ibo_ubwc = True,
     no_gs_hw_binning_quirk = True,
+    storage_8bit = True,
 )

 a730_magic_regs = dict(
@@ -1175,12 +1175,14 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
       /* floating-point conversions when moving from non-shared to shared
        * seem not to work. We only use floating-point types in ir3 for
        * conversions, so don't bother specially handling the case where the
-       * types are equal.
+       * types are equal. Same goes for 8-bit sign extension.
        */
       if ((instr->dsts[0]->flags & IR3_REG_SHARED) &&
          !(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)) &&
-          (full_type(instr->cat1.src_type) == TYPE_F32 ||
-           full_type(instr->cat1.dst_type) == TYPE_F32))
+          ((full_type(instr->cat1.src_type) == TYPE_F32 ||
+            full_type(instr->cat1.dst_type) == TYPE_F32) ||
+           (instr->cat1.src_type == TYPE_U8 &&
+            full_type(instr->cat1.dst_type) == TYPE_S32)))
          return false;

       /* Conversions seem not to work in shared->shared copies before scalar
@@ -69,7 +69,24 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
    ldib->dsts[0]->wrmask = MASK(intr->num_components);
    ldib->cat6.iim_val = intr->num_components;
    ldib->cat6.d = 1;
-   ldib->cat6.type = intr->def.bit_size == 16 ? TYPE_U16 : TYPE_U32;
+   switch (intr->def.bit_size) {
+   case 8:
+      /* This encodes the 8-bit SSBO load and matches blob's encoding of
+       * imageBuffer access using VK_FORMAT_R8 and the dedicated 8-bit
+       * descriptor. No vectorization is possible.
+       */
+      assert(intr->num_components == 1);
+
+      ldib->cat6.type = TYPE_U16;
+      ldib->cat6.typed = true;
+      break;
+   case 16:
+      ldib->cat6.type = TYPE_U16;
+      break;
+   default:
+      ldib->cat6.type = TYPE_U32;
+      break;
+   }
    ldib->barrier_class = IR3_BARRIER_BUFFER_R;
    ldib->barrier_conflict = IR3_BARRIER_BUFFER_W;

@@ -100,6 +117,17 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
     */
    val = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp);

+   /* Any 8-bit store will be done on a single-component value that additionally
+    * has to be masked to clear up the higher bits or it will malfunction.
+    */
+   if (intr->src[0].ssa->bit_size == 8) {
+      assert(ncomp == 1);
+
+      struct ir3_instruction *mask = create_immed_typed(b, 0xff, TYPE_U8);
+      val = ir3_AND_B(b, val, 0, mask, 0);
+      val->dsts[0]->flags |= IR3_REG_HALF;
+   }
+
    lower_ssbo_offset(ctx, intr, &intr->src[3], &offset, &imm_offset_val);
    struct ir3_instruction *imm_offset = create_immed(b, imm_offset_val);

@@ -107,7 +135,24 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
                      imm_offset, 0, val, 0);
    stib->cat6.iim_val = ncomp;
    stib->cat6.d = 1;
-   stib->cat6.type = intr->src[0].ssa->bit_size == 16 ? TYPE_U16 : TYPE_U32;
+   switch (intr->src[0].ssa->bit_size) {
+   case 8:
+      /* As with ldib, this encodes the 8-bit SSBO store and matches blob's
+       * encoding of imageBuffer access using VK_FORMAT_R8 and the extra 8-bit
+       * descriptor. No vectorization is possible and we have to override the
+       * relevant field anyway.
+       */
+      stib->cat6.type = TYPE_U16;
+      stib->cat6.iim_val = 4;
+      stib->cat6.typed = true;
+      break;
+   case 16:
+      stib->cat6.type = TYPE_U16;
+      break;
+   default:
+      stib->cat6.type = TYPE_U32;
+      break;
+   }
    stib->barrier_class = IR3_BARRIER_BUFFER_W;
    stib->barrier_conflict = IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;

@@ -67,6 +67,8 @@ struct ir3_compiler_options {

    /* True if 16-bit descriptors are available. */
    bool storage_16bit;
+   /* True if 8-bit descriptors are available. */
+   bool storage_8bit;

    /* If base_vertex should be lowered in nir */
    bool lower_base_vertex;
@@ -273,6 +273,50 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
    if (src_type == dst_type)
       return src;

+   /* Zero-extension of 8-bit values doesn't work with `cov`, so simple masking
+    * is used to achieve the result.
+    */
+   if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_U32) {
+      struct ir3_instruction *mask = create_immed_typed(ctx->block, 0xff, TYPE_U8);
+      struct ir3_instruction *cov = ir3_AND_B(ctx->block, src, 0, mask, 0);
+      cov->dsts[0]->flags |= type_flags(dst_type);
+      return cov;
+   }
+
+   /* Conversion of 8-bit values into floating-point values doesn't work with
+    * a simple `cov`, instead the 8-bit values first have to be converted into
+    * corresponding 16-bit values and converted from there.
+    */
+   if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_F32) {
+      assert(op == nir_op_u2f16 || op == nir_op_i2f16 ||
+             op == nir_op_u2f32 || op == nir_op_i2f32);
+
+      struct ir3_instruction *cov;
+      if (op == nir_op_u2f16 || op == nir_op_u2f32) {
+         struct ir3_instruction *mask = create_immed_typed(ctx->block, 0xff, TYPE_U8);
+         cov = ir3_AND_B(ctx->block, src, 0, mask, 0);
+         cov->dsts[0]->flags |= IR3_REG_HALF;
+         cov = ir3_COV(ctx->block, cov, TYPE_U16, dst_type);
+      } else {
+         cov = ir3_COV(ctx->block, src, TYPE_U8, TYPE_S16);
+         cov = ir3_COV(ctx->block, cov, TYPE_S16, dst_type);
+      }
+      return cov;
+   }
+
+   /* Conversion of floating-point values to 8-bit values also doesn't work
+    * through a single `cov`, instead the conversion has to go through the
+    * corresponding 16-bit type that's then truncated.
+    */
+   if (full_type(src_type) == TYPE_F32 && dst_type == TYPE_U8) {
+      assert(op == nir_op_f2u8 || op == nir_op_f2i8);
+
+      type_t intermediate_type = op == nir_op_f2u8 ? TYPE_U16 : TYPE_S16;
+      struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, intermediate_type);
+      cov = ir3_COV(ctx->block, cov, intermediate_type, TYPE_U8);
+      return cov;
+   }
+
    struct ir3_instruction *cov = ir3_COV(ctx->block, src, src_type, dst_type);

    if (op == nir_op_f2f16_rtne) {
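To make the workarounds above concrete, here is an illustrative host-side C model of the conversion semantics the emitted instruction sequences must reproduce (in-range values only; helper names are made up for this sketch and are not part of ir3):

#include <stdint.h>

/* Zero-extension: emitted as an AND with 0xff instead of a cov. */
static uint32_t model_zext_u8(uint8_t v) { return (uint32_t)v & 0xff; }

/* Unsigned 8-bit -> float: mask to a 16-bit value, then convert
 * (the AND + ir3_COV(TYPE_U16, dst_type) path in the hunk above). */
static float model_u2f32(uint8_t v) { return (float)(uint16_t)(v & 0xff); }

/* Signed 8-bit -> float: sign-extend to 16 bits first, then convert
 * (the ir3_COV(TYPE_U8, TYPE_S16) + ir3_COV(TYPE_S16, ...) path). */
static float model_i2f32(int8_t v) { return (float)(int16_t)v; }

/* float -> 8-bit: convert to the matching 16-bit type, then truncate
 * (ir3_COV to intermediate_type, then ir3_COV down to TYPE_U8). */
static uint8_t model_f2u8(float f) { return (uint8_t)(uint16_t)f; }
static int8_t  model_f2i8(float f) { return (int8_t)(int16_t)f; }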
@@ -1611,9 +1655,11 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
 {
    /* Note: we can only use isam for vectorized loads/stores if isam.v is
     * available.
+    * Note: isam also can't handle 8-bit loads.
     */
    if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
        (intr->def.num_components > 1 && !ctx->compiler->has_isam_v) ||
+       (ctx->compiler->options.storage_8bit && intr->def.bit_size == 8) ||
        !ctx->compiler->has_isam_ssbo) {
       ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
       return;
@@ -416,7 +416,7 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
          return false;
       if (!is_cat2_float(instr->opc) && !is_cat3_float(instr->opc))
          return false;
-   } else if (src->cat1.dst_type == TYPE_U16) {
+   } else if (src->cat1.dst_type == TYPE_U16 || src->cat1.dst_type == TYPE_S16) {
       /* Since we set CONSTANT_DEMOTION_ENABLE, a float reference of
        * what was a U16 value read from the constbuf would incorrectly
        * do 32f->16f conversion, when we want to read a 16f value.
@@ -100,6 +100,13 @@ ir3_nir_should_scalarize_mem(const nir_instr *instr, const void *data)
       return true;
    }

+   if ((intrin->intrinsic == nir_intrinsic_load_ssbo &&
+        intrin->def.bit_size == 8) ||
+       (intrin->intrinsic == nir_intrinsic_store_ssbo &&
+        intrin->src[0].ssa->bit_size == 8)) {
+      return true;
+   }
+
    return false;
 }

@@ -187,6 +187,11 @@ lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b,
        (!has_dest && intrinsic->src[0].ssa->bit_size == 16))
       shift = 1;

+   /* for 8-bit ssbo access, offset is in 8-bit words instead of dwords */
+   if ((has_dest && intrinsic->def.bit_size == 8) ||
+       (!has_dest && intrinsic->src[0].ssa->bit_size == 8))
+      shift = 0;
+
    /* Here we create a new intrinsic and copy over all contents from the old
     * one. */

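Put differently, the ir3 SSBO offset source is expressed in units of the access size. A small sketch of the mapping from a byte offset, using the same shift values as the hunk above (the helper name is hypothetical):

#include <assert.h>
#include <stdint.h>

/* Convert a byte offset into the word offset load_ssbo_ir3/store_ssbo_ir3
 * expect: dwords for 32-bit access (shift = 2), 16-bit words for 16-bit
 * access (shift = 1), and bytes for 8-bit access (shift = 0). */
static uint32_t
ssbo_word_offset(uint32_t byte_offset, unsigned bit_size)
{
   unsigned shift = bit_size == 8 ? 0 : bit_size == 16 ? 1 : 2;
   assert((byte_offset & ((1u << shift) - 1)) == 0); /* must stay aligned */
   return byte_offset >> shift;
}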
@@ -326,7 +331,8 @@ ir3_nir_max_imm_offset(nir_intrinsic_instr *intrin, const void *data)

    switch (intrin->intrinsic) {
    case nir_intrinsic_load_ssbo_ir3:
-      if ((nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER))
+      if ((nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
+          !(compiler->options.storage_8bit && intrin->def.bit_size == 8))
          return 255; /* isam.v */
       return 127; /* ldib.b */
    case nir_intrinsic_store_ssbo_ir3:
@@ -761,10 +761,14 @@ can_demote_src(struct ir3_instruction *instr)
    case OPC_META_COLLECT:
       return false;
    case OPC_MOV:
-      /* non-shared -> shared floating-point conversions don't work */
+      /* non-shared -> shared floating-point conversions and
+       * 8-bit sign extension don't work.
+       */
       return (!(instr->dsts[0]->flags & IR3_REG_SHARED) ||
-              (full_type(instr->cat1.src_type) != TYPE_F32 &&
-               full_type(instr->cat1.dst_type) != TYPE_F32));
+              !((full_type(instr->cat1.src_type) == TYPE_F32 ||
+                 full_type(instr->cat1.dst_type) == TYPE_F32) ||
+                (instr->cat1.src_type == TYPE_U8 &&
+                 full_type(instr->cat1.dst_type) == TYPE_S32)));
    default:
       return (!is_alu(instr) && !is_sfu(instr)) ||
              !(instr->dsts[0]->flags & IR3_REG_SHARED);
@@ -2795,6 +2795,7 @@ tu_bind_descriptor_sets(struct tu_cmd_buffer *cmd,
    case FMT6_32_UINT:
       offset_shift = 2;
       break;
+   case FMT6_8_UINT:
    default:
       offset_shift = 0;
       break;
@@ -69,7 +69,8 @@ descriptor_size(struct tu_device *dev,
       */
       return A6XX_TEX_CONST_DWORDS * 4 * (1 +
          COND(dev->physical_device->info->a6xx.storage_16bit &&
-              !dev->physical_device->info->a6xx.has_isam_v, 1));
+              !dev->physical_device->info->a6xx.has_isam_v, 1) +
+          COND(dev->physical_device->info->a7xx.storage_8bit, 1));
    case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
       return binding->descriptorCount;
    default:
@@ -1016,9 +1017,17 @@ write_buffer_descriptor_addr(const struct tu_device *device,
     * 16-bit and 32-bit access through isam.v will of course only be functional
     * when 16-bit storage is supported. */
    assert(!info->a6xx.has_isam_v || info->a6xx.storage_16bit);
+   /* Any configuration enabling 8-bit storage support will also provide 16-bit
+    * storage support and 16-bit descriptors capable of 32-bit isam loads. This
+    * indirectly ensures we won't need more than two descriptors for access of
+    * any size.
+    */
+   assert(!info->a7xx.storage_8bit || (info->a6xx.storage_16bit &&
+                                       info->a6xx.has_isam_v));

-   unsigned num_descriptors = 1 + COND(info->a6xx.storage_16bit &&
-                                       !info->a6xx.has_isam_v, 1);
+   unsigned num_descriptors = 1 +
+      COND(info->a6xx.storage_16bit && !info->a6xx.has_isam_v, 1) +
+      COND(info->a7xx.storage_8bit, 1);
    memset(dst, 0, num_descriptors * A6XX_TEX_CONST_DWORDS * sizeof(uint32_t));

    if (!buffer_info || buffer_info->address == 0)
@@ -1053,6 +1062,18 @@ write_buffer_descriptor_addr(const struct tu_device *device,
          A6XX_TEX_CONST_2_TYPE(A6XX_TEX_BUFFER);
       dst[4] = A6XX_TEX_CONST_4_BASE_LO(base_va);
       dst[5] = A6XX_TEX_CONST_5_BASE_HI(base_va >> 32);
+      dst += A6XX_TEX_CONST_DWORDS;
+   }
+
+   if (info->a7xx.storage_8bit) {
+      dst[0] = A6XX_TEX_CONST_0_TILE_MODE(TILE6_LINEAR) | A6XX_TEX_CONST_0_FMT(FMT6_8_UINT);
+      dst[1] = range;
+      dst[2] =
+         A6XX_TEX_CONST_2_STRUCTSIZETEXELS(1) |
+         A6XX_TEX_CONST_2_STARTOFFSETTEXELS(offset) |
+         A6XX_TEX_CONST_2_TYPE(A6XX_TEX_BUFFER);
+      dst[4] = A6XX_TEX_CONST_4_BASE_LO(base_va);
+      dst[5] = A6XX_TEX_CONST_5_BASE_HI(base_va >> 32);
    }
 }

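A storage-buffer binding thus packs up to three consecutive texture descriptors. A sketch of the resulting size arithmetic, mirroring descriptor_size() and write_buffer_descriptor_addr() above (assuming A6XX_TEX_CONST_DWORDS is 16, as in Mesa at the time of this patch):

#include <stdbool.h>

#define A6XX_TEX_CONST_DWORDS 16 /* assumed per-descriptor dword count */

/* Slots within one storage-buffer binding:
 * slot 0: the base descriptor;
 * +1 slot: extra descriptor when 16-bit storage exists but isam.v doesn't;
 * +1 slot: dedicated 8-bit (FMT6_8_UINT) descriptor on a750+. */
static unsigned
storage_desc_size_bytes(bool storage_16bit, bool has_isam_v, bool storage_8bit)
{
   unsigned n = 1;
   if (storage_16bit && !has_isam_v)
      n++;
   if (storage_8bit)
      n++;
   return n * A6XX_TEX_CONST_DWORDS * 4;
}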
@@ -142,6 +142,7 @@ get_device_extensions(const struct tu_physical_device *device,
                       struct vk_device_extension_table *ext)
 {
    *ext = (struct vk_device_extension_table) { .table = {
+      .KHR_8bit_storage = device->info->a7xx.storage_8bit,
       .KHR_16bit_storage = device->info->a6xx.storage_16bit,
       .KHR_bind_memory2 = true,
       .KHR_buffer_device_address = true,
@@ -379,7 +380,7 @@ tu_get_features(struct tu_physical_device *pdevice,
    /* Vulkan 1.2 */
    features->samplerMirrorClampToEdge = true;
    features->drawIndirectCount = true;
-   features->storageBuffer8BitAccess = false;
+   features->storageBuffer8BitAccess = pdevice->info->a7xx.storage_8bit;
    features->uniformAndStorageBuffer8BitAccess = false;
    features->storagePushConstant8 = false;
    features->shaderBufferInt64Atomics = false;
@@ -1096,7 +1097,8 @@ tu_get_properties(struct tu_physical_device *pdevice,
    props->uniformBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4;
    props->robustUniformBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4;
    props->storageBufferDescriptorSize = A6XX_TEX_CONST_DWORDS * 4 * (1 +
-      COND(pdevice->info->a6xx.storage_16bit && !pdevice->info->a6xx.has_isam_v, 1));
+      COND(pdevice->info->a6xx.storage_16bit && !pdevice->info->a6xx.has_isam_v, 1) +
+      COND(pdevice->info->a7xx.storage_8bit, 1));
    props->robustStorageBufferDescriptorSize =
       props->storageBufferDescriptorSize;
    props->inputAttachmentDescriptorSize = TU_DEBUG(DYNAMIC) ?
@@ -2301,6 +2303,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
       .bindless_fb_read_descriptor = -1,
       .bindless_fb_read_slot = -1,
       .storage_16bit = physical_device->info->a6xx.storage_16bit,
+      .storage_8bit = physical_device->info->a7xx.storage_8bit,
       .shared_push_consts = !TU_DEBUG(PUSH_CONSTS_PER_STAGE),
    };
    device->compiler = ir3_compiler_create(
@@ -282,15 +282,20 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
       }
    }

-   /* For isam, we need to adjust the descriptor index to use the 32-bit
-    * descriptor if 16-bit storage support is present but the 16-bit descriptor
-    * cannot be used for 32-bit access through isam.v.
+   /* Descriptor index has to be adjusted in the following cases:
+    * - isam loads, when the 16-bit descriptor cannot also be used for 32-bit
+    *   loads -- next-index descriptor will be able to do that;
+    * - 8-bit SSBO loads and stores -- next-index descriptor is dedicated to
+    *   storage accesses of that size.
     */
-   if (dev->physical_device->info->a6xx.storage_16bit &&
-       !dev->physical_device->info->a6xx.has_isam_v &&
-       intrin->intrinsic == nir_intrinsic_load_ssbo &&
-       (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
-       intrin->def.bit_size > 16) {
+   if ((dev->physical_device->info->a6xx.storage_16bit &&
+        !dev->physical_device->info->a6xx.has_isam_v &&
+        intrin->intrinsic == nir_intrinsic_load_ssbo &&
+        (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
+        intrin->def.bit_size > 16) ||
+       (dev->physical_device->info->a7xx.storage_8bit &&
+        ((intrin->intrinsic == nir_intrinsic_load_ssbo && intrin->def.bit_size == 8) ||
+         (intrin->intrinsic == nir_intrinsic_store_ssbo && intrin->src[0].ssa->bit_size == 8)))) {
       descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
    }

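The index adjustment in the last hunk boils down to a small predicate. An illustrative reduction of that logic (a standalone sketch, not the literal turnip code):

#include <stdbool.h>

/* How much to add to the base descriptor index for one SSBO access.
 * The next-index descriptor serves both the isam case (the 16-bit
 * descriptor can't do 32-bit loads without isam.v) and the dedicated
 * 8-bit case, so a single +1 suffices for either. */
static unsigned
descriptor_index_adjustment(bool storage_16bit, bool has_isam_v,
                            bool storage_8bit, bool is_load,
                            bool can_reorder, unsigned bit_size)
{
   bool isam_needs_32bit_desc = storage_16bit && !has_isam_v &&
                                is_load && can_reorder && bit_size > 16;
   bool is_8bit_access = storage_8bit && bit_size == 8;
   return (isam_needs_32bit_desc || is_8bit_access) ? 1 : 0;
}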